Python scrapy.signals module: spider_closed() example source code
The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.signals.spider_closed().
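Before the individual examples, here is a minimal, self-contained sketch of the typical wiring (the SpiderClosedLogger class name and the log message are illustrative, not taken from any of the projects below): an extension connects its handler in from_crawler via crawler.signals.connect, and Scrapy calls the handler with the spider and the close reason once the spider finishes.

# Minimal sketch (illustrative names, not from the projects below):
# connect a handler to scrapy.signals.spider_closed and log the close reason.
import logging

from scrapy import signals

logger = logging.getLogger(__name__)


class SpiderClosedLogger(object):
    """Extension that logs how and why a spider finished."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # spider_closed fires once per spider with the spider object and the
        # close reason ('finished', 'cancelled', 'shutdown', ...)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider, reason):
        logger.info('Spider %s closed (%s), %s items scraped',
                    spider.name, reason,
                    spider.crawler.stats.get_value('item_scraped_count', 0))

To enable such an extension you would list it in the EXTENSIONS setting; the module path here is a placeholder, e.g. EXTENSIONS = {'myproject.extensions.SpiderClosedLogger': 500}.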
def spider_closed(self, spider):
    """Handle the spider_closed event to save the map"""
    # create the special marker for all the ads without geocode
    print("found %d items without geocode" % len(self.no_geocode))
    if len(self.no_geocode) > 0:
        html = ""
        for x in self.no_geocode:
            html += "<a href=%s target=_blank>%s</a> : %s<br>" % (x["url"], x["title"], x["price"])
        iframe = folium.element.IFrame(html=html, width=500, height=100)
        popup = folium.Popup(iframe, max_width=500)
        folium.Marker(MAP_LATLNG,
                      popup=popup,
                      icon=folium.Icon()).add_to(self.m_map)
    print("found %d new items" % self.new_items)
    pickle.dump(self.m_list, open(DATABASE, 'wb'))
    self.m_map.save('map.html')
def __init__(self, rule):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.rule = rule
    self.name = rule.name
    self.allowed_domains = rule.allowed_domains.split(',')
    self.start_urls = rule.start_urls.split(',')
    rule_list = []
    # add a "next page" rule when one is configured
    if len(rule.next_page):
        rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
    rule_list.append(Rule(LinkExtractor(
        allow=rule.allow_url.split(','),
        unique=True),
        follow=True,
        callback='parse_item'))
    self.rules = tuple(rule_list)
    super(ProxySpiderSpider, self).__init__()
def from_crawler(cls, crawler):
    try:
        output_path = (
            crawler.settings.get('FEEDS_CONFIG')['Feeds']['output_path']
        )
    except (KeyError, TypeError):
        output_path = 'output'
    try:
        output_url = (
            crawler.settings.get('FEEDS_CONFIG')['Feeds']['output_url']
        )
    except (KeyError, TypeError):
        output_url = None
    pipeline = cls(output_path=output_path, output_url=output_url)
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def process_item(self, item, spider):
    if isinstance(item, AirbnbItem):
        self.room_count += 1
        if self.room_count > 100000:
            self.room_count = 0
            self.room_file_count += 1
            self.spider_closed(spider, mode=1)
            self.spider_opened(spider, mode=1)
        self.exporter_room.export_item(item)
    elif isinstance(item, UserItem):
        self.user_count += 1
        if self.user_count > 100000:
            self.user_count = 0
            self.user_file_count += 1
            self.spider_closed(spider, mode=2)
            self.spider_opened(spider, mode=2)
        self.exporter_user.export_item(item)
    else:
        logger.info('Some error happened!')
    return item
def run_spider():
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })
    # enable remote server certificate verification
    # see http://doc.scrapy.org/en/latest/topics/settings.html#downloader-clientcontextfactory
    settings.set('DOWNLOADER_CLIENTCONTEXTFACTORY',
                 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'
                 )
    # uncomment the line below to enable logging for debugging
    # configure_logging()
    crawler = Crawler(JenkinsJobSpider, settings)
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
def spider_closed(spider):
    spider_stats[spider.name] = {
        'finish_reason': spider.crawler.stats.get_value('finish_reason'),
        'duration': (
            spider.crawler.stats.get_value('finish_time') -
            spider.crawler.stats.get_value('start_time')).total_seconds(),
        'item_scraped_count':
            spider.crawler.stats.get_value('item_scraped_count'),
    }
    print("Spider %s closed (%s) after %0.1f sec, %d items" % (
        spider.name,
        spider.crawler.stats.get_value('finish_reason'),
        (spider.crawler.stats.get_value('finish_time') -
         spider.crawler.stats.get_value('start_time')).total_seconds(),
        spider.crawler.stats.get_value('item_scraped_count') or 0,
    ))
def __init__(self):
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)
    dispatcher.connect(self.engine_started, signals.engine_started)
    # current working directory of the scrapy_site project
    self.curpath = os.getcwd()
    # directory holding the per-spider message files
    self.spidername_filepath = self.curpath + "/scrapy_site/msg/"
    # keywords loaded from keyword.conf
    self.keywordsDict = dict()
    self.getKeywords()
    # website names
    self.webnamesDict = dict()
    self.getWebnames()
    # collected messages
    self.msgDict = dict()
    SavePipeline.initCount = SavePipeline.initCount + 1
def __init__(self, settings):
    self.options = settings.get('PHANTOMJS_OPTIONS', {})  # PhantomJS options
    max_run = settings.get('PHANTOMJS_MAXRUN', 10)  # maximum number of concurrent PhantomJS instances, default 10
    self.sem = defer.DeferredSemaphore(max_run)
    self.queue = Queue.LifoQueue(maxsize=max_run)  # LIFO queue holding up to max_run PhantomJS instances
    SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)
def __init__(self, crawler):
    self.crawler = crawler
    self.initiatives = 0
    self.amendments = 0
    self.finishtext = 0
    self.responses = 0
    self.members = 0
    # connect the extension object to signals
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
def spider_closed(self, spider):
    self.crawler.stats.set_value('item/initiatives', self.initiatives)
    self.crawler.stats.set_value('item/amendments', self.amendments)
    self.crawler.stats.set_value('item/finishtext', self.finishtext)
    self.crawler.stats.set_value('item/responses', self.responses)
    self.crawler.stats.set_value('item/members', self.members)
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()
def __init__(self, *a, **kw):
    """Attach a callback to the spider_closed signal"""
    super(Kijiji, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    if USE_DB is True:
        self.open_database()
    if DRAW_ALL_DB is True and DRAW_NEW_AD_ONLY is False:
        # add a marker for every ad that is already known
        for x in self.m_list:
            self.add_marker(x, False)
def spider_closed(self, spider):
    self.log('spider_closed signal received: the spider has finished crawling')
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    for exporter in self.exporters.values():
        exporter.finish_exporting()
    for file in self.files:
        file.close()
def from_crawler(cls, crawler):
    ext = cls(crawler.stats)
    crawler.signals.connect(ext.spider_opened,
                            signal=signals.spider_opened)
    crawler.signals.connect(ext.spider_closed,
                            signal=signals.spider_closed)
    crawler.signals.connect(ext.item_scraped,
                            signal=signals.item_scraped)
    return ext
def spider_closed(self, spider):
    value = self.stats.get_value('item_scraped_count', 0)
    save_stats(spider.settings['SPIDER_STATS_URL'],
               spider._id,
               value)
    if spider.settings['BOT_NAME'] != 'TestSpider':
        logger.info('spider[%s] crawled %d articles',
                    spider.name,
                    value)
    if value == 0:
        update_spider_stats(spider, {'fail': 1})
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    file = self.files.pop(spider.name)
    file.close()
def spider_closed(self, spider):
    self.file.seek(-2, os.SEEK_END)
    self.file.truncate()
    self.file.write(']')
    self.file.close()
def from_crawler(cls, crawler):
    m = cls()
    if not crawler.settings.getbool('SELENIUM_ENABLED'):
        raise NotConfigured()
    crawler.signals.connect(m.spider_closed, signal=signals.spider_closed)
    return m
def spider_closed(self, spider):
    self.driver.close()
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)  # run spider_closed when the spider closes
    return pipeline
def spider_closed(self, spider, reason):
    """
    Print crawl statistics when the spider is closed.
    :param spider:
    :param reason: finished/cancelled/shutdown
    :return:
    """
    print(time.strftime("%Y-%m-%d %H:%M:%S"), 'StatsPipeline Signals: spider_closed')
    print(spider.crawler.stats.get_stats())
    print(spider.crawler.stats.get_value('downloader/request_count', 0))   # requests sent
    print(spider.crawler.stats.get_value('downloader/response_count', 0))  # responses downloaded
    print(spider.crawler.stats.get_value('response_received_count', 0))    # responses received
    print(spider.crawler.stats.get_value('item_dropped_count', 0))         # items dropped
    print(spider.crawler.stats.get_value('item_scraped_count', 0))         # items scraped
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.engine_started, signals.engine_started)            # engine started
    crawler.signals.connect(pipeline.engine_stopped, signals.engine_stopped)            # engine stopped
    crawler.signals.connect(pipeline.item_scraped, signals.item_scraped)                # item scraped
    crawler.signals.connect(pipeline.item_dropped, signals.item_dropped)                # item dropped
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)              # spider opened
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)              # spider closed
    crawler.signals.connect(pipeline.spider_idle, signals.spider_idle)                  # spider idle
    crawler.signals.connect(pipeline.spider_error, signals.spider_error)                # spider error
    crawler.signals.connect(pipeline.request_scheduled, signals.request_scheduled)      # request scheduled
    crawler.signals.connect(pipeline.request_dropped, signals.request_dropped)          # request dropped
    crawler.signals.connect(pipeline.response_received, signals.response_received)      # response received
    crawler.signals.connect(pipeline.response_downloaded, signals.response_downloaded)  # response downloaded
    return pipeline
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file_csv = self.files.pop(spider)
    file_csv.close()
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file_xml = self.files.pop(spider)
    file_xml.close()
def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline
def __init__(self, *args, **kwargs):
    super(FullDomainSpider, self).__init__(*args, **kwargs)
    self.allowed_domains = kwargs.get('allowed_domains').split(',')
    self.org = kwargs.get('org')
    self.start_urls = kwargs.get('start_urls').split(',')
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def spider_closed(self, spider):
    try:
        self.conn.close()
    except Exception:
        log.msg("Could not close database connection", level=log.ERROR)
def spider_closed(self, spider):
    self.exporters[spider].finish_exporting()
    file = self.files.pop(spider)
    file.close()
def __exit__(self, exc_type, exc_val, exc_tb):
    responses = self.crawler.signals.send_catch_log(signal=signals.spider_closed,
                                                    spider=self.spider, reason=None)
    for _, failure in responses:
        if failure:
            failure.raiseException()
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()
def process_item(self, item, spider):
    self.count += 1
    if self.count > 1000:
        self.count = 0
        self.file_count += 1
        self.spider_closed()
        self.spider_opened()
    self.exporter.export_item(item)
    return item
def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()