"""Crawl Toutiao search results for a keyword and download gallery images.

For each search-result page the script collects article URLs, fetches each
article, extracts the embedded gallery JSON, and saves every gallery image
to a local directory named after the MD5 of its content.
"""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool

import pymongo
import requests
from lxml import etree
from requests.exceptions import ConnectionError, Timeout

#from config import *

# Seconds to wait for a server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Captures the escaped JSON string handed to JSON.parse(...) for the gallery
# object embedded in an article page's inline script.
GALLERY_PATTERN = re.compile(
    r'gallery: JSON.parse\("(.*?)"\),\s+siblingList', re.S
)

# Sub-directory (of the current working directory) that receives the images.
IMAGE_DIR_NAME = 'pictyre'

client = pymongo.MongoClient('localhost')
db = client['toutiao1']


def _fetch(url, params=None):
    """Return the response of a GET on *url*, or None unless it answers 200.

    Connection failures and timeouts are reported on stdout and turned into
    a None result so that one bad URL does not abort the whole crawl.
    """
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    except (ConnectionError, Timeout):
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response
    return None


def get_page_parse(offset, keyword):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword.

    Returns:
        The response body as text, or None if the request failed or the
        server did not answer with status 200.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
    }
    url = "https://www.toutiao.com/search_content/"
    response = _fetch(url, params=data)
    return response.text if response is not None else None


def parse_page_index(text):
    """Yield the article URLs found in a search-result JSON document.

    Args:
        text: JSON text as returned by get_page_parse(); None or an empty
            string yields nothing.

    Yields:
        Each non-empty ``article_url`` value from the entries under the
        top-level ``data`` key.
    """
    if not text:
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        print('Error occurred: search result is not valid JSON')
        return
    # "data" may be absent or null; treat both as "no results".
    for item in data.get('data') or []:
        if "article_url" in item:
            article_url = item.get("article_url")
            if article_url:
                yield article_url


def get_page_detail(url):
    """Fetch an article page.

    Returns:
        The page HTML as text, or None if the request failed or the server
        did not answer with status 200.
    """
    response = _fetch(url)
    return response.text if response is not None else None


def parse_page_details(html, url):
    """Extract title and gallery images from an article page and download them.

    Args:
        html: Article HTML as returned by get_page_detail(); may be None.
        url: URL the HTML was fetched from (echoed in the result).

    Returns:
        A dict with ``title`` (list of text nodes of ``<title>``), ``url``
        and ``images_urls``, or None when the page is missing, contains no
        gallery block, or the gallery block cannot be decoded.
    """
    if not html:
        return None

    document = etree.HTML(html)
    title = document.xpath('//head/title/text()')

    match = GALLERY_PATTERN.search(html)
    if match is None:
        # Not a gallery article: nothing to download.
        return None

    # The captured text is a JSON document escaped for embedding in a JS
    # string literal; dropping every backslash undoes that escaping.
    gallery_text = match.group(1).replace('\\', "")
    try:
        gallery = json.loads(gallery_text)
    except JSONDecodeError:
        print('Error occurred: gallery data is not valid JSON')
        return None

    images_urls = [item.get('url') for item in gallery.get("sub_images") or []]
    for images_url in images_urls:
        if images_url:
            download_image(images_url)
    return {
        'title': title,
        'url': url,
        'images_urls': images_urls,
    }


def save_to_mongo(resuit):
    """Insert one document into the ``toutiao1`` collection.

    Args:
        resuit: Document (dict) to store. The parameter name is kept as-is
            so existing keyword callers keep working.

    Returns:
        True if the document was inserted, False otherwise.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in 4.
    outcome = db['toutiao1'].insert_one(resuit)
    if outcome.inserted_id is not None:
        print("yes")
        return True
    return False


def download_image(url):
    """Download the image at *url* and store it via save_image().

    Always returns None; failures are reported on stdout.
    """
    print('brgain', url)
    response = _fetch(url)
    if response is not None:
        save_image(response.content)
    return None


def save_image(content):
    """Write image bytes to ``<cwd>/pictyre/<md5 of content>.jpg``.

    The file name is derived from the content hash, so an image that was
    already saved is not written again.
    """
    directory = os.path.join(os.getcwd(), IMAGE_DIR_NAME)
    # open() does not create missing directories.
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(
        directory, '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    )
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Process one search-result page: fetch it and handle every article."""
    text = get_page_parse(offset, '街拍')
    for url in parse_page_index(text):
        html = get_page_detail(url)
        if html is None:
            # The article could not be fetched; move on to the next one.
            continue
        parse_page_details(html, url)
        # Storing the parsed result with save_to_mongo() is left disabled,
        # as it was in the original script.


if __name__ == '__main__':
    offsets = [page * 20 for page in range(1, 2)]
    # The context manager releases the worker processes once map() is done.
    with Pool() as pool:
        pool.map(main, offsets)