Scrapy requests to heute.at always get 403 responses

I'm trying to scrape www.heute.at with Scrapy for a personal data science project. I'm also using scrapy-rotating-proxies with the Tor proxies described below. However, I only get 403 https://www.heute.at/>: HTTP status code is not handled or not allowed responses. I've also used the Tor Browser to check whether the site is reachable via Tor at all (yes, it is!) and then tried to imitate the Tor Browser's request by copying its headers into Scrapy, with no success. Please see my setup and attempts in detail below. Any help or leads are much appreciated:

This is my spider; it is supposed to simply collect all links to articles:

import scrapy
from scrapy.loader import ItemLoader
from HEUTE.items import heuteLinkItems
from dotenv import load_dotenv

class heuteLinks(scrapy.Spider):
    name = "heuteLinks"
    start_urls = ['https://www.heute.at/']

    # parse the start page: collect article links and follow the links in the main menu
    def parse(self, response):
        for item in response.xpath('//*[contains(@class, "link")]/@href'):
            zacken = ItemLoader(item=heuteLinkItems(), selector=item)
            zacken.add_value('mainPage', response.url)
            zacken.add_value('link', item.get())
            yield zacken.load_item()
        for link in response.xpath('//*[contains(@class, "mainmenu")]//@href'):
            url = link.get()
            yield scrapy.Request(url, self.parse2)

    # parse the pages followed from the main menu and collect their article links
    def parse2(self, response):
        for item in response.xpath('//*[contains(@class, "link")]/@href'):
            zacken = ItemLoader(item=heuteLinkItems(), selector=item)
            zacken.add_value('mainPage', response.url)
            zacken.add_value('link', item.get())
            yield zacken.load_item()
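
The XPaths themselves can be sanity-checked offline with parsel (the selector library Scrapy uses underneath) against a copy of the homepage saved from the Tor Browser. A minimal sketch, where the filename is just a placeholder:

from parsel import Selector

# a copy of the homepage saved manually via the Tor Browser (placeholder filename)
html = open("heute_homepage.html", encoding="utf-8").read()
sel = Selector(text=html)

# same expression the spider uses for article links
print(sel.xpath('//*[contains(@class, "link")]/@href').getall()[:10])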

The items.py:

import scrapy
from itemloaders.processors import TakeFirst, Join, MapCompose
from scrapy.exceptions import DropItem

def urlMaker(x):
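    # keep only article links (their paths contain '/s/') and make them absolute; other hrefs are dropped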
    if '/s/' in x:
        return 'https://www.heute.at' + x

class heuteLinkItems(scrapy.Item):
    mainPage = scrapy.Field(output_processor=TakeFirst(),)
    link = scrapy.Field(input_processor=MapCompose(urlMaker), output_processor=TakeFirst())

The settings.py:

from dotenv import load_dotenv
import os
import random
load_dotenv("../SETUP/.env")
ip = os.environ.get("server_domain")

BOT_NAME = "HEUTE"

SPIDER_MODULES = ["HEUTE.spiders"]
NEWSPIDER_MODULE = "HEUTE.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "HEUTE (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = random.randint(1,3)
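# note: randint(1, 3) is evaluated only once when the settings are loaded, so the
# delay is a fixed 1-3 s for the whole crawl; Scrapy's RANDOMIZE_DOWNLOAD_DELAY
# (True by default) already adds per-request jitter on top of DOWNLOAD_DELAY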
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'",
    "Accept-Language": "en-US,en;q=0.5",
    'Cookie': 'ioam2018=00014b3f0e4ceb55c65081931:1725355954229:1695029554229:.heute.at:9:at_w_atheute:RedCont/Homepage/Homepage:noevent:1695047962431:g96nzp; dakt_2_uuid=76c9c244b122d37b4bfc4089ca8207a6; dakt_2_uuid_ts=1695029555113; dakt_2_version=2.1.61; _pbjs_userid_consent_data=3524755945110770; __gads=ID=a7625cd4974c024b:T=1695029556:RT=1695047964:S=ALNI_MYz08UbrntABhw-fNYFwC0Fve4kXQ; __gpi=UID=00000c782856d0ce:T=1695029556:RT=1695047964:S=ALNI_MZC5e8mon2kgCOPwmy8suXyIFzxEg; cto_bundle=MiDme19ZaUNLcUdlY0s1RUtYMG8lMkZCdll5Nkd4QXhvZXVvaCUyRml2cHAlMkIlMkZjUExoZnJTS3lWejMxUnNmT3hwYVNWcm1uMCUyRk8wVGhqREYySjdURjVmNHZ1bnNnJTJCcVZ1JTJCeDhFSWNtV1QxQSUyQldYMVY2dGFxNWp2MldvZ2g4aTElMkZJM2pnJTJCQlBz; dakt_2_session_id=1171e864c3d2baf83d6a6e6fad954d06',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-User': '?1',
    'TE': 'trailers'
}



# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}



# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False


# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

ROTATING_PROXY_LIST = [
    f'{ip}:8118',
    f'{ip}:8119',
    f'{ip}:8120'
]

ROTATING_PROXY_BAN_POLICY = 'HEUTE.policy.BanPolicy'

USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0" 

When running scrapy crawl heuteLinks I only get 403 https://www.heute.at/>: HTTP status code is not handled or not allowed responses:

2023-09-18 13:16:34 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'HEUTE',
 'CONCURRENT_REQUESTS': 1,
 'DOWNLOAD_DELAY': 1,
 'FEED_EXPORT_ENCODING': 'utf-8',
 'NEWSPIDER_MODULE': 'HEUTE.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'SPIDER_MODULES': ['HEUTE.spiders'],
 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 '
               'Firefox/102.0'}
2023-09-18 13:16:34 [asyncio] DEBUG: Using selector: EpollSelector
2023-09-18 13:16:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2023-09-18 13:16:34 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.unix_events._UnixSelectorEventLoop
2023-09-18 13:16:34 [scrapy.extensions.telnet] INFO: Telnet Password: 825c4fdec07d4a54
2023-09-18 13:16:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.throttle.AutoThrottle']
2023-09-18 13:16:34 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'rotating_proxies.middlewares.RotatingProxyMiddleware',
 'rotating_proxies.middlewares.BanDetectionMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2023-09-18 13:16:34 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2023-09-18 13:16:34 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2023-09-18 13:16:34 [scrapy.core.engine] INFO: Spider opened
2023-09-18 13:16:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2023-09-18 13:16:34 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2023-09-18 13:16:34 [rotating_proxies.middlewares] INFO: Proxies(good: 0, dead: 0, unchecked: 10, reanimated: 0, mean backoff time: 0s)
2023-09-18 13:16:35 [stem] DEBUG: GETCONF __owningcontrollerprocess (runtime: 0.0003)
2023-09-18 13:16:35 [rotating_proxies.expire] DEBUG: Proxy <http://my.proxy.link:8123> is DEAD
2023-09-18 13:16:35 [rotating_proxies.middlewares] DEBUG: Retrying <GET https://www.heute.at/> with another proxy (failed 1 times, max retries: 5)
2023-09-18 13:16:38 [rotating_proxies.expire] DEBUG: Proxy <http://my.proxy.link:8124> is DEAD
2023-09-18 13:16:38 [rotating_proxies.middlewares] DEBUG: Retrying <GET https://www.heute.at/> with another proxy (failed 2 times, max retries: 5)
2023-09-18 13:16:44 [rotating_proxies.middlewares] DEBUG: 1 proxies moved from 'dead' to 'reanimated'
2023-09-18 13:16:45 [rotating_proxies.expire] DEBUG: Proxy <http://my.proxy.link:8118> is DEAD
2023-09-18 13:16:45 [rotating_proxies.middlewares] DEBUG: Retrying <GET https://www.heute.at/> with another proxy (failed 3 times, max retries: 5)
2023-09-18 13:16:54 [rotating_proxies.expire] DEBUG: Proxy <http://my.proxy.link:8126> is DEAD
2023-09-18 13:16:54 [rotating_proxies.middlewares] DEBUG: Retrying <GET https://www.heute.at/> with another proxy (failed 4 times, max retries: 5)
2023-09-18 13:16:59 [rotating_proxies.expire] DEBUG: Proxy <http://my.proxy.link:8123> is DEAD
2023-09-18 13:16:59 [rotating_proxies.middlewares] DEBUG: Retrying <GET https://www.heute.at/> with another proxy (failed 5 times, max retries: 5)
2023-09-18 13:17:04 [rotating_proxies.middlewares] INFO: Proxies(good: 0, dead: 4, unchecked: 6, reanimated: 0, mean backoff time: 188s)
2023-09-18 13:17:06 [rotating_proxies.expire] DEBUG: Proxy <http://my.proxy.link:8127> is DEAD
2023-09-18 13:17:06 [rotating_proxies.middlewares] DEBUG: Gave up retrying <GET https://www.heute.at/> (failed 6 times with different proxies)
2023-09-18 13:17:06 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://www.heute.at/> (referer: None)
2023-09-18 13:17:06 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://www.heute.at/>: HTTP status code is not handled or not allowed
2023-09-18 13:17:06 [scrapy.core.engine] INFO: Closing spider (finished)
2023-09-18 13:17:06 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'bans/status/403': 6,
 'downloader/request_bytes': 9396,
 'downloader/request_count': 6,
 'downloader/request_method_count/GET': 6,
 'downloader/response_bytes': 32301,
 'downloader/response_count': 6,
 'downloader/response_status_count/403': 6,
 'elapsed_time_seconds': 31.850255,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 9, 18, 11, 17, 6, 707687),
 'httpcompression/response_bytes': 6444,
 'httpcompression/response_count': 1,
 'httperror/response_ignored_count': 1,
 'httperror/response_ignored_status_count/403': 1,
 'log_count/DEBUG': 18,
 'log_count/INFO': 13,
 'memusage/max': 73097216,
 'memusage/startup': 73097216,
 'proxies/dead': 5,
 'proxies/mean_backoff': 188.23789066016286,
 'proxies/reanimated': 0,
 'proxies/unchecked': 6,
 'response_received_count': 1,
 'scheduler/dequeued': 6,
 'scheduler/dequeued/memory': 6,
 'scheduler/enqueued': 6,
 'scheduler/enqueued/memory': 6,
 'spider_name': 'heuteLinks',
 'start_time': datetime.datetime(2023, 9, 18, 11, 16, 34, 857432),
 'urls_failed': '',
 'urls_requested': ''}
2023-09-18 13:17:06 [scrapy.core.engine] INFO: Spider closed (finished)

The custom scrapy-rotating-proxies ban policy signals a proxy to switch its circuit / exit node once it is banned:

import scrapy
from rotating_proxies.policy import BanDetectionPolicy
from stem import Signal
from stem.control import Controller
import stem.util
from dotenv import load_dotenv
import os
import socket
load_dotenv("../SETUP/.env")

class BanPolicy(BanDetectionPolicy):
    def response_is_ban(self, request, response):
        ban = super(BanPolicy, self).response_is_ban(request, response) 
        address = socket.gethostbyname(os.environ.get('server_domain')) # getting proxy ip
        port = int(os.environ.get(f"torproxy_{request.meta.get('proxy')[-4:]}").split(",")[1]) # getting proxy control port
        with Controller.from_port(address=address, port=port) as controller: # connecting to proxy
            controller.authenticate(os.environ.get("torproxy_controller_pass")) # authenticating
            stem.util.log.get_logger().propagate = False # disable stem's noisy INFO logging so it does not pollute the Scrapy log; workaround based on: https://github.com/torproject/stem/issues/112#
            controller.signal(Signal.NEWNYM) # tell the proxy to switch to a new circuit / exit IP
            controller.close()
        return ban
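
For context, the circuit switch can also be verified outside of Scrapy. A rough standalone sketch (host, ports and password are placeholders, not my real values); Tor rate-limits NEWNYM, so the exit IP may take a few seconds to change:

import time

import requests
from stem import Signal
from stem.control import Controller

PROXY_HOST = "my.proxy.link"   # placeholder
HTTP_PORT = 8118               # HTTP proxy port of the torproxy container
CONTROL_PORT = 9051            # Tor control port
CONTROL_PASS = "changeme"      # placeholder

proxies = {"http": f"http://{PROXY_HOST}:{HTTP_PORT}",
           "https": f"http://{PROXY_HOST}:{HTTP_PORT}"}

def exit_ip():
    # check.torproject.org reports the IP the request arrived from
    r = requests.get("https://check.torproject.org/api/ip", proxies=proxies, timeout=60)
    return r.json()["IP"]

print("before:", exit_ip())
with Controller.from_port(address=PROXY_HOST, port=CONTROL_PORT) as controller:
    controller.authenticate(CONTROL_PASS)
    controller.signal(Signal.NEWNYM)
time.sleep(10)  # give Tor a moment to build the new circuit
print("after:", exit_ip())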

When deactivating the rotating proxies in the settings:

DOWNLOADER_MIDDLEWARES = {
    # 'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
    # 'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}

Everything works fine: Scrapy accesses the site and scrapes the items flawlessly.
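
To narrow it down further, the same request can be sent once directly and once through one of the Tor proxies outside of Scrapy, to see whether the 403 is tied to the Tor exit IP rather than the headers. A quick sketch with requests (the proxy address is a placeholder):

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
proxies = {"https": "http://my.proxy.link:8118"}  # placeholder proxy address

# direct request vs. the same request routed through one Tor proxy
print("direct: ", requests.get("https://www.heute.at/", headers=headers, timeout=30).status_code)
print("via tor:", requests.get("https://www.heute.at/", headers=headers, proxies=proxies, timeout=30).status_code)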

The Tor proxies are run via Docker. The docker-compose.yml file:

version: '3'
services:
  tor_proxy_1:
    &proxy_template
    image: dperson/torproxy
    container_name: tor_proxy_1
    environment:
      - PASSWORD=${torproxy_controller_pass}
      - BW=0
      - EXITNOTE=0
      - TOR_NewCircuitPeriod=1
    ports:
      - 8118:8118
      - 9050:9050
      - 9051:9051 #control port
    networks:
      - scrapernetwork
    restart: unless-stopped

  tor_proxy_2:
    <<: *proxy_template
    container_name: tor_proxy_2
    ports:
      - 8119:8118
      - 9052:9050
      - 9053:9051 #control port

  tor_proxy_3:
    <<: *proxy_template
    container_name: tor_proxy_3
    ports:
      - 8120:8118
      - 9054:9050
      - 9055:9051 #control port
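
Each container can also be checked individually, similar to the sketch above, to confirm it answers on its HTTP port and actually exits via Tor (the host is again a placeholder):

import requests

for port in (8118, 8119, 8120):
    proxies = {"http": f"http://my.proxy.link:{port}",
               "https": f"http://my.proxy.link:{port}"}
    r = requests.get("https://check.torproject.org/api/ip", proxies=proxies, timeout=60)
    print(port, r.json())  # expect {"IsTor": true, "IP": "..."} with differing IPs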

I tested accessing www.heute.at via the Tor Browser to see whether it works at all. It does. I then copied the request for the HTML document as a curl command from the Network tab of the Developer Tools.


To reproduce it on that level:

curl 'https://www.heute.at/' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' -H 'Connection: keep-alive' -H 'Cookie: ioam2018=00014b3f0e4ceb55c65081931:1725355954229:1695029554229:.heute.at:9:at_w_atheute:RedCont/Homepage/Homepage:noevent:1695047962431:g96nzp; dakt_2_uuid=76c9c244b122d37b4bfc4089ca8207a6; dakt_2_uuid_ts=1695029555113; dakt_2_version=2.1.61; _pbjs_userid_consent_data=3524755945110770; __gads=ID=a7625cd4974c024b:T=1695029556:RT=1695047964:S=ALNI_MYz08UbrntABhw-fNYFwC0Fve4kXQ; __gpi=UID=00000c782856d0ce:T=1695029556:RT=1695047964:S=ALNI_MZC5e8mon2kgCOPwmy8suXyIFzxEg; cto_bundle=MiDme19ZaUNLcUdlY0s1RUtYMG8lMkZCdll5Nkd4QXhvZXVvaCUyRml2cHAlMkIlMkZjUExoZnJTS3lWejMxUnNmT3hwYVNWcm1uMCUyRk8wVGhqREYySjdURjVmNHZ1bnNnJTJCcVZ1JTJCeDhFSWNtV1QxQSUyQldYMVY2dGFxNWp2MldvZ2g4aTElMkZJM2pnJTJCQlBz; dakt_2_session_id=1171e864c3d2baf83d6a6e6fad954d06' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: document' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: cross-site' -H 'If-Modified-Since: Mon, 18 Sep 2023 14:22:03 GMT' -H 'TE: trailers'

It also works fine, of course, and returns good-looking HTML.

With this information I then updated USER_AGENT and DEFAULT_REQUEST_HEADERS in settings.py, as already shown above.

However, it still doesn't work and only returns 403 responses...

Thanks for reading this far; any help or leads are much appreciated!
