博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
动态网站爬取实例
阅读量:5154 次
发布时间:2019-06-13

本文共 3558 字,大约阅读时间需要 11 分钟。

"""Scrape Toutiao search results for a keyword and download gallery images.

Flow: fetch a page of search results (JSON), extract the article URLs, fetch
each article page, pull the embedded gallery JSON out of the page source, and
download every image it lists into the ``pictyre`` directory under the
current working directory.
"""
import requests
import json
# ConnectionError is no longer referenced below (RequestException is its base
# class), but the import is kept so the module still exposes the same names.
from requests.exceptions import ConnectionError
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError
from lxml import etree
import re
import pymongo
import os
from hashlib import md5
from multiprocessing import Pool

# Seconds to wait for any single HTTP request; without a timeout a stalled
# server would block a worker process forever.
REQUEST_TIMEOUT = 10

# Matches the gallery payload embedded in the article page as a JavaScript
# string literal: gallery: JSON.parse("..."), siblingList
GALLERY_PATTERN = re.compile(
    r'gallery: JSON.parse\("(.*?)"\),\s+siblingList', re.S)

# NOTE(review): this client is created at import time, i.e. before the worker
# pool forks. save_to_mongo() is not called by main() at the moment; if it is
# enabled, confirm PyMongo's guidance on using MongoClient with multiprocessing.
client = pymongo.MongoClient('localhost')
db = client['toutiao1']


def _fetch(url, params=None):
    """Return the response for a GET on *url*, or None on error / non-200."""
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    except RequestException as exc:
        # Covers connection failures, timeouts and malformed URLs alike.
        print('Error occurred', exc)
        return None
    if response.status_code == 200:
        return response
    return None


def _decode_gallery(raw):
    """Decode the escaped JSON text captured by GALLERY_PATTERN.

    *raw* is the inside of a JavaScript string literal that itself holds JSON,
    so it is decoded twice: once as a string literal, once as a document. If
    that fails, fall back to stripping every backslash before parsing.

    Raises:
        JSONDecodeError: if neither strategy yields valid JSON.
    """
    try:
        return json.loads(json.loads('"{0}"'.format(raw)))
    except ValueError:
        return json.loads(raw.replace('\\', ''))


def get_page_parse(offset, keyword):
    """Fetch one page of search results for *keyword*.

    Args:
        offset: value sent as the ``offset`` query parameter.
        keyword: search term sent as the ``keyword`` query parameter.

    Returns:
        The raw response body as text, or None if the request failed or the
        server did not answer with HTTP 200.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
    }
    url = "https://www.toutiao.com/search_content/"
    response = _fetch(url, params=params)
    if response is None:
        return None
    return response.text


def parse_page_index(text):
    """Yield the ``article_url`` of every search result in *text*.

    Args:
        text: JSON text as returned by get_page_parse(); may be None.

    Yields:
        The value stored under ``article_url`` for each item that has one.
        Nothing is yielded when *text* is empty, is not valid JSON, or has no
        ``data`` list.
    """
    if not text:
        return
    try:
        payload = json.loads(text)
    except JSONDecodeError:
        return
    if not isinstance(payload, dict):
        return
    # 'data' can be absent or null in the response; treat both as "no results".
    for item in payload.get('data') or []:
        if "article_url" in item:
            yield item.get("article_url")


def get_page_detail(url):
    """Fetch an article page.

    Returns:
        The page source as text, or None if the request failed or the server
        did not answer with HTTP 200.
    """
    response = _fetch(url)
    if response is None:
        return None
    return response.text


def parse_page_details(html, url):
    """Extract the title and gallery images from an article page.

    Every image URL found is downloaded as a side effect.

    Args:
        html: page source as returned by get_page_detail(); may be None.
        url: the article URL, echoed back in the result.

    Returns:
        A dict with keys ``title`` (list of text nodes from ``<title>``),
        ``url`` and ``images_urls`` (list of image URLs), or None when *html*
        is empty or the page contains no parsable gallery.
    """
    if not html:
        return None
    tree = etree.HTML(html)
    title = tree.xpath('//head/title/text()') if tree is not None else []

    match = GALLERY_PATTERN.search(html)
    if match is None:
        # Not a gallery article; nothing to download.
        return None
    try:
        gallery = _decode_gallery(match.group(1))
    except JSONDecodeError:
        return None
    if not isinstance(gallery, dict):
        return None

    images_urls = [
        item.get('url')
        for item in gallery.get('sub_images') or []
        if item.get('url')
    ]
    for images_url in images_urls:
        download_image(images_url)
    return {
        'title': title,
        'url': url,
        'images_urls': images_urls,
    }


def save_to_mongo(resuit):
    """Insert *resuit* into the ``toutiao1`` collection.

    Returns:
        True if the document was inserted, False otherwise.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document API.
        outcome = db['toutiao1'].insert_one(resuit)
    except pymongo.errors.PyMongoError as exc:
        print('Error occurred', exc)
        return False
    if outcome.inserted_id is not None:
        print("yes")
        return True
    return False


def download_image(url):
    """Download the image at *url* and store it via save_image().

    Always returns None; failures are reported on stdout and otherwise ignored.
    """
    print('brgain', url)
    response = _fetch(url)
    if response is not None:
        save_image(response.content)
    return None


def save_image(content):
    """Write *content* to ``<cwd>/pictyre/<md5 of content>.jpg``.

    The file name is derived from the content hash, so an image that was
    already saved is not written again.
    """
    directory = os.path.join(os.getcwd(), 'pictyre')
    # The target directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(directory, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(directory, file_name)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Process one page of search results starting at *offset*."""
    text = get_page_parse(offset, '街拍')
    for url in parse_page_index(text):
        html = get_page_detail(url)
        # The parsed result is currently discarded; images are downloaded as a
        # side effect. Pass the result to save_to_mongo() to persist it.
        parse_page_details(html, url)


if __name__ == '__main__':
    # The context manager shuts the worker processes down once map() returns.
    with Pool() as pool:
        pool.map(main, [offset * 20 for offset in range(1, 2)])

 

转载于:https://www.cnblogs.com/realmonkeykingsun/p/7966284.html

你可能感兴趣的文章
UIPickerView的属性和使用方法
查看>>
解决idea创建ssm项目找不到mybatis的mapper的xml文件问题
查看>>
url里面的参数不能带特殊字符
查看>>
C#泛型编程基础知识总结【转】
查看>>
maven工程分开common和server
查看>>
洛谷 P2701 [USACO5.3]巨大的牛棚Big Barn
查看>>
虚基类
查看>>
背包的硬币问题
查看>>
linux查看CPU高速缓存(cache)信息
查看>>
VSFLEXgrid控件几个特殊的属性方法的使用
查看>>
GOIP connects with Elastix through “config by line”
查看>>
修改Win7远程桌面端口
查看>>
关于多线程的死锁
查看>>
2018-2019-1 20165205 20165233 实验二 固件程序设计
查看>>
APUE 学习笔记(三) 文件和目录
查看>>
APUE 学习笔记(十) 高级I/O
查看>>
Android Monkey压力测试
查看>>
BZOJ1878: [SDOI2009]HH的项链
查看>>
关于程序、进程和线程
查看>>
android登陆自动调整代码
查看>>