Python scrapy.signals module: spider_idle() example source code
The following code examples, extracted from open-source Python projects, illustrate how to use scrapy.signals.spider_idle().
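All of the snippets below share one pattern: connect a handler to signals.spider_idle and raise DontCloseSpider inside it so the spider stays alive when the request queue runs dry. A minimal, self-contained sketch of that pattern (the spider name and class name here are illustrative, not taken from any of the projects below):

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class IdleKeepAliveSpider(scrapy.Spider):  # hypothetical name for illustration
    name = 'idle_demo'
    start_urls = ['https://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(IdleKeepAliveSpider, cls).from_crawler(crawler, *args, **kwargs)
        # spider_idle fires when the engine has no requests left
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        # Raising DontCloseSpider keeps the crawl open so new
        # requests can still be injected from outside.
        self.logger.info("idle signal caught, keeping spider alive")
        raise DontCloseSpider

    def parse(self, response):
        yield {'url': response.url}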
def setup_redis(self):
    """Setup redis connection and idle signal.
    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from redis queue
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from redis list '%s'" % self.redis_key)

def spider_idle(self):
    """Schedules a request if available, otherwise waits."""
    self.schedule_next_request()
    raise DontCloseSpider
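The snippet calls a schedule_next_request() helper that is not shown here. In the scrapy-redis lineage this pair of methods comes from, it pops one URL from the redis list and feeds it to the engine; a rough sketch under that assumption (the lpop call and make_requests_from_url follow old scrapy-redis conventions):

def next_request(self):
    """Return a request to be scheduled, or None if the redis list is empty."""
    url = self.server.lpop(self.redis_key)  # pop the next start URL
    if url:
        return self.make_requests_from_url(url.decode('utf-8'))

def schedule_next_request(self):
    """Feed one request from redis into the running engine."""
    req = self.next_request()
    if req:
        # engine.crawl() takes a spider argument in Scrapy < 2.6
        self.crawler.engine.crawl(req, spider=self)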
def setup_rabbitmq(self):
    """Setup RabbitMQ connection.
    Call this method after spider has set its crawler object.
    :return: None
    """
    if not self.rabbitmq_key:
        self.rabbitmq_key = '{}:start_urls'.format(self.name)
    self.server, self.redis_server = connection.from_settings(self.crawler.settings)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

def spider_idle(self):
    """Waits for request to be scheduled.
    :return: None
    """
    self.crawler.engine.slot.scheduler.next_request()
    raise DontCloseSpider
def dm_setup(self):
    """Set method for spider idle state.
    It's implemented this way to support one and many instances of the mixin.
    """
    dispatcher.connect(
        self.dequeue_next_page_requests,
        signal=signals.spider_idle
    )
    self._was_setup_called = True

def spider_idle(self):
    self.log("Spider idle signal caught.")
    raise DontCloseSpider
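dispatcher here is the PyDispatcher API (older Scrapy exposed it as scrapy.xlib.pydispatch). A self-contained sketch of this mixin wiring, with a hypothetical dequeue_next_page_requests body (the local request queue is an assumption):

import collections
from pydispatch import dispatcher  # older Scrapy: from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals

class DequeueMixin(object):  # hypothetical name for illustration
    _was_setup_called = False

    def dm_setup(self):
        self._pending_pages = collections.deque()
        dispatcher.connect(
            self.dequeue_next_page_requests,
            signal=signals.spider_idle
        )
        self._was_setup_called = True

    def dequeue_next_page_requests(self):
        # Drain locally queued page requests back into the engine.
        while self._pending_pages:
            self.crawler.engine.crawl(self._pending_pages.popleft(), spider=self)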
def spider_idle(self):
    """Schedules a request if available, otherwise waits."""
    # XXX: Handle a sentinel to close the spider.
    self.schedule_next_requests()
    raise DontCloseSpider
@classmethod
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.engine_started, signals.engine_started)        # engine started
    crawler.signals.connect(pipeline.engine_stopped, signals.engine_stopped)        # engine stopped
    crawler.signals.connect(pipeline.item_scraped, signals.item_scraped)            # item scraped and passed all pipelines
    crawler.signals.connect(pipeline.item_dropped, signals.item_dropped)            # item dropped by a pipeline
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)          # spider opened for crawling
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)          # spider closed
    crawler.signals.connect(pipeline.spider_idle, signals.spider_idle)              # spider has no requests left
    crawler.signals.connect(pipeline.spider_error, signals.spider_error)            # spider callback raised an error
    crawler.signals.connect(pipeline.request_scheduled, signals.request_scheduled)  # request scheduled
    crawler.signals.connect(pipeline.request_dropped, signals.request_dropped)      # request dropped by the scheduler
    crawler.signals.connect(pipeline.response_received, signals.response_received)  # response received
    crawler.signals.connect(pipeline.response_downloaded, signals.response_downloaded)  # response downloaded
    return pipeline
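The connected handler methods are not shown in that project. Scrapy passes each signal's documented arguments to its handlers, so a cut-down, runnable version of such a pipeline might look like this (class name and log messages are illustrative):

import logging
from scrapy import signals

logger = logging.getLogger(__name__)

class SignalLoggingPipeline(object):  # hypothetical name for illustration
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_idle, signals.spider_idle)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        logger.info("spider %s opened", spider.name)

    def spider_idle(self, spider):
        # Not raising DontCloseSpider here, so the spider may still close.
        logger.info("spider %s is idle", spider.name)

    def spider_closed(self, spider, reason):
        logger.info("spider %s closed (%s)", spider.name, reason)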
def _set_crawler(self, crawler):
    super(StructureSpider, self)._set_crawler(crawler)
    self.crawler.signals.connect(self.spider_idle,
                                 signal=signals.spider_idle)

def spider_idle(self):
    if self.settings.getbool("IDLE", True):
        print("Don't close spider......")
        raise DontCloseSpider
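Because the handler reads the boolean IDLE setting, the keep-alive behaviour can be toggled per project; a hypothetical settings.py entry (the IDLE name comes from the snippet above, not from Scrapy itself):

# settings.py
IDLE = False  # let the spider close normally once it runs out of requests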
def set_crawler(self, crawler):
    super(RedisSpider, self).set_crawler(crawler)
    self.crawler.signals.connect(self.spider_idle,
                                 signal=signals.spider_idle)

def spider_idle(self):
    """Schedules a request if available, otherwise waits."""
    # XXX: Handle a sentinel to close the spider.
    self.schedule_next_requests()
    raise DontCloseSpider
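schedule_next_requests() (plural) is the batched variant; in scrapy-redis it pulls up to redis_batch_size URLs per idle signal. A rough sketch under that assumption:

def next_requests(self):
    """Yield up to redis_batch_size requests popped from the redis key."""
    found = 0
    while found < self.redis_batch_size:
        data = self.server.lpop(self.redis_key)  # assumption: the key holds a list of URLs
        if not data:
            break
        yield self.make_requests_from_url(data.decode('utf-8'))
        found += 1

def schedule_next_requests(self):
    """Feed a batch of requests from redis into the running engine."""
    for req in self.next_requests():
        # engine.crawl() takes a spider argument in Scrapy < 2.6
        self.crawler.engine.crawl(req, spider=self)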
def setup_redis(self, crawler=None):
    """Setup redis connection and idle signal.
    This should be called after the spider has set its crawler object.
    """
    if self.server is not None:
        return
    if crawler is None:
        # We allow optional crawler argument to keep backwards
        # compatibility.
        # XXX: Raise a deprecation warning.
        crawler = getattr(self, 'crawler', None)
    if crawler is None:
        raise ValueError("crawler is required")
    settings = crawler.settings
    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', DEFAULT_START_URLS_KEY,
        )
    self.redis_key = self.redis_key % {'name': self.name}
    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")
    if self.redis_batch_size is None:
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE', DEFAULT_START_URLS_BATCH_SIZE,
        )
    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")
    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s)", self.__dict__)
    self.server = connection.from_settings(crawler.settings)
    # The idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from redis queue
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
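In scrapy-redis this method is invoked once the crawler is attached to the spider; the wiring looks roughly like the from_crawler override below (based on the scrapy-redis RedisMixin):

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
    obj.setup_redis(crawler)  # connects spider_idle once the crawler exists
    return obj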
def setup_redis(self, crawler=None):
    """Setup redis connection and idle signal.
    This should be called after the spider has set its crawler object.
    """
    if self.server is not None:
        return
    if crawler is None:
        # We allow optional crawler argument to keep backwards
        # compatibility.
        # XXX: Raise a deprecation warning.
        crawler = getattr(self, 'crawler', None)
    if crawler is None:
        raise ValueError("crawler is required")
    settings = crawler.settings
    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
        )
    self.redis_key = self.redis_key % {'name': self.name}
    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")
    if self.redis_batch_size is None:
        # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE',
            settings.getint('CONCURRENT_REQUESTS'),
        )
    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")
    if self.redis_encoding is None:
        self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                     self.__dict__)
    self.server = connection.from_settings(crawler.settings)
    # The idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from redis queue.
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
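Putting the pieces together, a minimal end-to-end usage sketch (assumes the scrapy_redis package and a redis server reachable via the project settings; MySpider is a hypothetical name):

from scrapy_redis.spiders import RedisSpider

class MySpider(RedisSpider):
    name = 'myspider'
    redis_key = 'myspider:start_urls'

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}

# Push start URLs from another shell; the idle spider will pick them up:
#   redis-cli lpush myspider:start_urls https://example.com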