
Analyzing Ajax to Scrape Toutiao Street-Snap Photos and Save Them to MongoDB

Prerequisite: MongoDB must be installed.

Note: the Toutiao pages have since changed, so the code below is not guaranteed to still work.
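The idea behind "analyzing Ajax" is that the search page renders no results in its static HTML; the data comes from a JSON endpoint that can be requested directly. A minimal sketch of inspecting that endpoint, using the same URL and parameters as the full script below (so it is equally subject to the page change noted above):

import requests
from urllib.parse import urlencode

# Same endpoint and parameters as the full script below; the response
# format may have changed, so treat this purely as an illustration.
params = urlencode({'offset': 0, 'format': 'json', 'keyword': '街拍',
                    'autoload': 'true', 'count': 20, 'cur_tab': 3})
resp = requests.get('http://www.toutiao.com/search_content/?' + params)
print(list(resp.json().keys()))  # expect a 'data' field holding the article list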

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'

# connect=False defers the connection, so the client is safe to share
# with the multiprocessing Pool used in main()
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Request one page of the Ajax search-results JSON."""
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    # Name the file by the MD5 of its content, so duplicate images are skipped
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield the article URL of every item in the search-results JSON."""
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass
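For example, with offset=20 the URL built by get_page_index comes out as follows (urlencode percent-encodes the keyword):

http://www.toutiao.com/search_content/?autoload=true&count=20&cur_tab=3&format=json&keyword=%E8%A1%97%E6%8B%8D&offset=20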


def get_page_detail(url):
    """Fetch the HTML of an article's detail page."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Pull the title and the gallery image URLs out of a detail page."""
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # The image list is embedded in the page as gallery: JSON.parse("...")
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


def save_to_mongo(result):
    # insert_one replaces the insert() method removed in recent pymongo releases
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        print(html)
        result = parse_page_detail(html, url)
        print(result)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    # Offsets 20, 40, ..., 400 — one group of 20 results per worker
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
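To run the spider and confirm the records landed in MongoDB (the script filename here is just an example):

$ python toutiao_spider.py
$ mongo
> use toutiao
> db.toutiao.find().limit(1)

Each stored document has the title/url/images shape returned by parse_page_detail, and the downloaded images appear in the working directory, named by MD5 hash.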

