
Need to handle Error: "Target page, context or browser has been closed" #191

Closed
@munkhbato

Description

Hi, this error shows up frequently and I don't know how to handle it. Besides wasting memory, it's really annoying and it might be slowing down the scrape.
Here I am using multiple contexts with one page each, but the behavior was the same when I used multiple pages with one context.
I am even creating a new context for each page and closing it within both parse and errback. (The code is below the error.)
I am only allowing requests for HTML documents, so maybe the library is trying to handle other requests after I close the page within parse. I haven't looked into the source code, though, so I've got no clue.
Can anyone help me?
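
One defensive pattern to try (a sketch, not a confirmed fix; the safe_close helper is hypothetical) is to close the page before its context and tolerate a target that is already gone. Note this only guards the spider's own close calls; it cannot silence an error raised inside one of the handler's internal event listeners:

from playwright.async_api import Error as PlaywrightError

async def safe_close(page):
    # Close the page first, then its context; ignore the error if
    # the target has already been closed by someone else.
    try:
        await page.close()
        await page.context.close()
    except PlaywrightError:
        pass

The full traceback: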

[asyncio] ERROR: Exception in callback AsyncIOEventEmitter._emit_run.<locals>.callback(<Task finishe...been closed')>) at /usr/local/lib/python3.9/dist-packages/pyee/asyncio.py:65
handle: <Handle AsyncIOEventEmitter._emit_run.<locals>.callback(<Task finishe...been closed')>) at /usr/local/lib/python3.9/dist-packages/pyee/asyncio.py:65>
Traceback (most recent call last):
  File "/usr/lib/python3.9/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/usr/local/lib/python3.9/dist-packages/pyee/asyncio.py", line 71, in callback
    self.emit("error", exc)
  File "/usr/local/lib/python3.9/dist-packages/pyee/base.py", line 179, in emit
    self._emit_handle_potential_error(event, args[0] if args else None)
  File "/usr/local/lib/python3.9/dist-packages/pyee/base.py", line 139, in _emit_handle_potential_error
    raise error
  File "/usr/local/lib/python3.9/dist-packages/scrapy_playwright/handler.py", line 606, in _log_request
    referrer = await request.header_value("referer")
  File "/usr/local/lib/python3.9/dist-packages/playwright/async_api/_generated.py", line 381, in header_value
    return mapping.from_maybe_impl(await self._impl_obj.header_value(name=name))
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_network.py", line 232, in header_value
    return (await self._actual_headers()).get(name)
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_network.py", line 240, in _actual_headers
    headers = await self._channel.send("rawRequestHeaders")
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_connection.py", line 61, in send
    return await self._connection.wrap_api_call(
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_connection.py", line 461, in wrap_api_call
    return await cb()
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_connection.py", line 96, in inner_send
    result = next(iter(done)).result()
playwright._impl._api_types.Error: Target page, context or browser has been closed
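
Reading the traceback: the failing frame is scrapy_playwright's _log_request, which awaits request.header_value("referer") and, judging by the pyee AsyncIOEventEmitter frames above it, appears to run as a Playwright "request" event listener. Such listeners are scheduled as asyncio tasks, so one can still fire after the context has been closed in parse. A standalone sketch of that race (an illustration of the suspected mechanism, not guaranteed to fail on any given run; the URL is a placeholder):

import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as pw:
        browser = await pw.firefox.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        async def log_request(request):
            # Scheduled as a task by the event emitter; if the context
            # closes before it runs, the await below raises "Target page,
            # context or browser has been closed".
            print(await request.header_value("referer"))

        page.on("request", log_request)
        await page.goto("https://example.com", wait_until="domcontentloaded")
        await context.close()  # may race listeners queued for in-flight requests
        await browser.close()

asyncio.run(main())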

Here is the code:

from scrapy.crawler import CrawlerProcess
import scrapy

class SomeSpider(scrapy.Spider):
    name = "some_spider"  # placeholder; a Scrapy spider requires a name

    def start_requests(self):
        urls = []  # the actual URLs were omitted from the original post
        for i, url in enumerate(urls):
            new_ctx = str(i)  # one dedicated context per request
            proxy = {}  # optional proxy kwargs for the context (left empty here)
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.catch_errors,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": new_ctx,
                    "playwright_context_kwargs": {
                        "java_script_enabled": False,
                        "ignore_https_errors": True,
                        **proxy,
                    },
                    "playwright_page_goto_kwargs": {
                        "wait_until": "domcontentloaded",
                        "timeout": 30 * 1000,
                    },
                },
            )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        title = await page.title()
        # await page.content()
        await page.context.close()  # closes this request's dedicated context

        # parse page
        # response.xpath...
        return

    async def catch_errors(self, failure):
        try:
            page = failure.request.meta["playwright_page"]
            await page.context.close()
        except Exception:
            # the page may never have been created, or is already closed
            pass

        # handle errors



def should_abort_request(request):
    # abort everything except top-level HTML documents
    return request.resource_type != "document"

        

if __name__ == "__main__":

    CONCURRENCY = 4  # placeholder; the actual value was not shown in the post
    time_to_run = 0  # placeholder; 0 disables CLOSESPIDER_TIMEOUT

    settings = {
        'ROBOTSTXT_OBEY': False,
        'BOT_NAME': "",
        'FEEDS': {},
        'LOG_LEVEL': 'INFO',
        'RETRY_ENABLED': False,
        'COOKIES_ENABLED': False,
        'REDIRECT_ENABLED': True,
        'CONCURRENT_REQUESTS': CONCURRENCY,
        'CLOSESPIDER_TIMEOUT': time_to_run,
        'CLOSESPIDER_ITEMCOUNT': 0,
        'CLOSESPIDER_PAGECOUNT': 0,
        'CLOSESPIDER_ERRORCOUNT': 25,
        'TELNETCONSOLE_ENABLED': None,
        'EXTENSIONS': {
            'scrapy.extensions.closespider.CloseSpider': 100,
        },
        # 'LOGSTATS_INTERVAL': 60 * 10,

        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "USER_AGENT": None,  # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "PLAYWRIGHT_BROWSER_TYPE": "firefox",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
            "timeout": 100 * 1000,
        },
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 30 * 1000,
        "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1,
        "PLAYWRIGHT_MAX_CONTEXTS": 4,
        "PLAYWRIGHT_ABORT_REQUEST": should_abort_request,
    }

    process = CrawlerProcess(settings)

    process.crawl(SomeSpider)
    process.start()
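
For what it's worth, if the Page object is not actually needed in the callback, a hedged alternative is to drop playwright_include_page entirely: per the scrapy-playwright README, the handler is then responsible for closing the page once the response is ready, so the spider never races it on close. Inside the same start_requests loop as above, the request would look like:

            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.catch_errors,
                meta={
                    "playwright": True,
                    # no "playwright_include_page": the handler owns the page
                    "playwright_context": new_ctx,
                    "playwright_context_kwargs": {
                        "java_script_enabled": False,
                        "ignore_https_errors": True,
                        **proxy,
                    },
                },
            )

parse would then work from response.text / response.xpath alone, with no page to close.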

