Hi, this error shows up frequently and I don't know how to handle it. Besides wasting memory, it's really annoying and it might be slowing down the scrape.
Here I am using multiple contexts with one page each, but it was the same when I used multiple pages with one context.
I am even creating a new context for each request and closing it in both parse and errback. (The code is below the error.)
I am only allowing requests for HTML documents, so maybe the library is trying to handle other requests after I closed the page within parse. I haven't looked into the source code, though, so I've got no clue.
Can anyone help me?
[asyncio] ERROR: Exception in callback AsyncIOEventEmitter._emit_run.<locals>.callback(<Task finishe...been closed')>) at /usr/local/lib/python3.9/dist-packages/pyee/asyncio.py:65
handle: <Handle AsyncIOEventEmitter._emit_run.<locals>.callback(<Task finishe...been closed')>) at /usr/local/lib/python3.9/dist-packages/pyee/asyncio.py:65>
Traceback (most recent call last):
  File "/usr/lib/python3.9/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/usr/local/lib/python3.9/dist-packages/pyee/asyncio.py", line 71, in callback
    self.emit("error", exc)
  File "/usr/local/lib/python3.9/dist-packages/pyee/base.py", line 179, in emit
    self._emit_handle_potential_error(event, args[0] if args else None)
  File "/usr/local/lib/python3.9/dist-packages/pyee/base.py", line 139, in _emit_handle_potential_error
    raise error
  File "/usr/local/lib/python3.9/dist-packages/scrapy_playwright/handler.py", line 606, in _log_request
    referrer = await request.header_value("referer")
  File "/usr/local/lib/python3.9/dist-packages/playwright/async_api/_generated.py", line 381, in header_value
    return mapping.from_maybe_impl(await self._impl_obj.header_value(name=name))
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_network.py", line 232, in header_value
    return (await self._actual_headers()).get(name)
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_network.py", line 240, in _actual_headers
    headers = await self._channel.send("rawRequestHeaders")
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_connection.py", line 61, in send
    return await self._connection.wrap_api_call(
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_connection.py", line 461, in wrap_api_call
    return await cb()
  File "/usr/local/lib/python3.9/dist-packages/playwright/_impl/_connection.py", line 96, in inner_send
    result = next(iter(done)).result()
playwright._impl._api_types.Error: Target page, context or browser has been closed
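For what it's worth, here is my guess at what is happening, as a minimal plain-Playwright sketch (untested, and I haven't checked it against the scrapy-playwright source): a "request" event listener (like the _log_request handler in the traceback) holds on to a Request object, and awaiting request.header_value() after I close the context should fail exactly like this. example.com is just a stand-in URL.

import asyncio
from playwright.async_api import async_playwright, Error as PlaywrightError

async def main():
    async with async_playwright() as pw:
        browser = await pw.firefox.launch(headless=True)
        ctx = await browser.new_context()
        page = await ctx.new_page()
        captured = []
        page.on("request", captured.append)  # same event _log_request listens to
        await page.goto("https://example.com", wait_until="domcontentloaded")
        await ctx.close()  # what my parse() does
        try:
            # Late header lookup, as in handler.py's _log_request
            await captured[0].header_value("referer")
        except PlaywrightError as exc:
            print(exc)  # "Target page, context or browser has been closed"
        await browser.close()

asyncio.run(main())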
Here is the code:
from scrapy.crawler import CrawlerProcess
import scrapy


class SomeSpider(scrapy.Spider):
    name = "some_spider"  # placeholder; the real name isn't relevant here

    def start_requests(self):
        urls = []  # target URLs omitted
        for i, url in enumerate(urls):
            new_ctx = str(i)  # a fresh context name per request
            proxy = {}  # per-context proxy kwargs, omitted
            yield scrapy.Request(
                url,
                callback=self.parse,
                errback=self.catch_errors,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": new_ctx,
                    "playwright_context_kwargs": {
                        "java_script_enabled": False,
                        "ignore_https_errors": True,
                        **proxy,
                    },
                    "playwright_page_goto_kwargs": {
                        "wait_until": "domcontentloaded",
                        "timeout": 30 * 1000,
                    },
                },
            )
    async def parse(self, response):
        page = response.meta["playwright_page"]
        title = await page.title()
        # await page.content()
        await page.context.close()
        # parse page
        # response.xpath...
        return

    async def catch_errors(self, failure):
        try:
            page = failure.request.meta["playwright_page"]
            await page.context.close()
        except Exception:
            pass
        # handle errors
def should_abort_request(request):
    # Abort everything except "document" (HTML) requests.
    return request.resource_type not in ("document",)
if __name__ == "__main__":
    CONCURRENCY = 4        # placeholder; real value not shown here
    time_to_run = 60 * 60  # placeholder; real value not shown here
    settings = {
        'ROBOTSTXT_OBEY': False,
        'BOT_NAME': "",
        'FEEDS': {},
        'LOG_LEVEL': 'INFO',
        'RETRY_ENABLED': False,
        'COOKIES_ENABLED': False,
        'REDIRECT_ENABLED': True,
        'CONCURRENT_REQUESTS': CONCURRENCY,
        'CLOSESPIDER_TIMEOUT': time_to_run,
        'CLOSESPIDER_ITEMCOUNT': 0,
        'CLOSESPIDER_PAGECOUNT': 0,
        'CLOSESPIDER_ERRORCOUNT': 25,
        'TELNETCONSOLE_ENABLED': None,
        'EXTENSIONS': {
            'scrapy.extensions.closespider.CloseSpider': 100,
        },
        # 'LOGSTATS_INTERVAL': 60 * 10,
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "USER_AGENT": None,  # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "PLAYWRIGHT_BROWSER_TYPE": "firefox",
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
            "timeout": 100 * 1000,
        },
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 30 * 1000,
        "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1,
        "PLAYWRIGHT_MAX_CONTEXTS": 4,
        "PLAYWRIGHT_ABORT_REQUEST": should_abort_request,
    }
    process = CrawlerProcess(settings)
    process.crawl(SomeSpider)
    process.start()
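For reference, this is the guarded close I'm considering instead, based on the guess above: close the page first so its listeners are detached, then close the context, and swallow the "already closed" error. Just an untested sketch, not a confirmed fix.

from playwright.async_api import Error as PlaywrightError

    async def parse(self, response):
        page = response.meta["playwright_page"]
        title = await page.title()
        try:
            await page.close()          # detach page-level event handlers first
            await page.context.close()  # then drop the per-request context
        except PlaywrightError:
            pass  # page/context already gone; nothing to clean up
        # parse page
        # response.xpath...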