I spent five days teaching myself Python, and used them to learn how to write a Python web scraper.
First, install Python (download the latest release). Then install the requests_html library; detailed usage is documented on its GitHub page.

Install the requests_html library:

    pip install requests_html
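requests_html wraps requests and adds HTML parsing. The basic pattern the script below relies on is: open an HTMLSession, fetch a page, and query elements with CSS selectors. A minimal sketch (example.com stands in for any page you want to scrape):

    from requests_html import HTMLSession

    session = HTMLSession()
    r = session.get("https://example.com")        # fetch the page
    for link in r.html.find("a"):                 # CSS selector query
        print(link.text, link.attrs.get("href"))  # element text and attributes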
The downloader uses multiple threads, and each gallery page gets its own folder.
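Before reading the full script, here is the bare multithreading pattern it uses, stripped of the download logic: the URL list is cut into equal slices, one thread per slice, the last slice takes the remainder, and a lock guards the shared progress counter. The worker body below just counts; in the real script it downloads one file per item:

    import threading

    lock = threading.Lock()
    count = 0

    def worker(items, start, end):
        # process only the slice items[start:end]
        global count
        for item in items[start:end]:
            # a real worker would download `item` here
            with lock:      # the lock keeps the shared counter consistent
                count += 1

    def run(items, num_thread=5):
        part = len(items) // num_thread
        threads = []
        for i in range(num_thread):
            start = part * i
            # the last slice absorbs the remainder when the split is uneven
            end = len(items) if i == num_thread - 1 else start + part
            threads.append(threading.Thread(target=worker, args=(items, start, end)))
            threads[-1].start()
        for t in threads:
            t.join()        # block until every slice is processed

    run(["url%d" % i for i in range(13)])
    print(count)            # prints 13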
The complete code is below.
    from requests_html import HTMLSession
    import requests
    import datetime
    import sys
    import threading
    import re
    import os

    lock = threading.Lock()
    count = 0


    def Handler(start, end, urls, path_name):
        # Each worker thread downloads its own slice urls[start:end]
        global count
        for url in urls[start:end]:
            header = {"Referer": "https://www.mzitu.com/"}
            html = requests.get(url, timeout=10, headers=header)
            # Timestamped file names keep parallel threads from overwriting each other
            path = "D:/TS/img/{0}/{1}.jpg".format(
                path_name, datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'))
            with open(path, 'wb') as file:
                file.write(html.content)
            with lock:
                count += 1
            num = len(urls)
            done = int(50 * count / num)  # 50 is the width of the progress bar
            # \r moves the cursor back to the start of the line,
            # so each write redraws the progress bar in place
            sys.stdout.write("\rProgress: [%s%s] %d%%"
                             % ('█' * done, ' ' * (50 - done), 100 * count / num))
            sys.stdout.flush()


    def download_file(urls_list, path_name, num_thread=5):
        '''
        Earlier single-page experiment, kept here disabled:
        s_list = []
        session = HTMLSession()
        myheader = {
            'cookie': 'tt_webid=6705276193249183240; __tea_sdk__ssid=undefined; passport_auth_status=4a9f2962712b3b506165773ad2e6306c; sso_auth_status=837c704700f69a2fde9c4355fa9590a6; sso_uid_tt=59274967267a0a837da1e7079c1481f9; toutiao_sso_user=0eac5ef2472d1a0761cb1823053bf5c7; sessionid=213fa2b2b530ccebccdce18228e7e8f5; _mp_test_key_1=a799564a27129c71b94096eabf84f280; uid_tt=ea22a09bb3ec13fcc29415915afafef1; _ba=BA0.2-20191003-5110e-Ot9q1XxXYtSpyTFCYg6H; currentMediaId=6142685756',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        }
        r = session.get(url, headers=myheader)
        img_title = r.html.find('h4')[0].text
        mkdir("D:\\TS\\img\\{}\\".format(img_title))
        titles = r.html.find('[data-src]')
        for title in titles:
            s_list.append(title.attrs['data-src'])
        '''
        file_size = len(urls_list)
        # Split the URL list into equal chunks, one per thread;
        # the last chunk takes the remainder when the list does not divide evenly
        part = file_size // num_thread
        for i in range(num_thread):
            start = part * i
            end = file_size if i == num_thread - 1 else start + part
            t = threading.Thread(target=Handler,
                                 kwargs={'start': start, 'end': end,
                                         'urls': urls_list, 'path_name': path_name})
            t.daemon = True
            t.start()
        # Wait until every download thread has finished
        main_thread = threading.current_thread()
        for t in threading.enumerate():
            if t is main_thread:
                continue
            t.join()
        print()


    def mkdir(path):
        # Strip surrounding whitespace and any trailing backslash,
        # then create the directory only if it does not already exist
        path = path.strip().rstrip("\\")
        if not os.path.exists(path):
            os.makedirs(path)
            print(path + ' created')
            return True
        else:
            print(path + ' already exists')
            return False


    def down_t66y(url):
        # Experimental crawler for a second site; the download call is still disabled
        session = HTMLSession()
        myheader = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
        r = session.get(url, headers=myheader)
        links = r.html.find('h3 a')
        for link in links:
            if "套" in link.text:  # keep only threads whose title contains "套"
                global count
                count = 0
                # download_file("https://cc.vttg.pw/" + link.attrs["href"])
                # print(link.attrs["href"])


    def down_mzt(url):
        # Download one complete image set from its gallery page
        session = HTMLSession()
        myheader = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
                    "Referer": "https://www.mzitu.com/",
                    "accept-encoding": "gzip"}
        r = session.get(url, headers=myheader)
        # The last <span>NN in the markup is the number of images in the set
        pages = re.findall(r'<span>\d{1,2}', r.text)
        num = int(pages[-1][6:])
        title = r.html.find('h2[class=main-title]')[0].text
        path_name = "D:\\TS\\img\\{}\\".format(title)
        mkdir(path_name)
        # Take the first image URL apart: filepath is the directory part,
        # fullflname the full file name (e.g. abc01.jpg), fname the bare
        # name (abc01), ext the extension (.jpg)
        links = r.html.find('div[class=main-image] img')[0]
        filepath, fullflname = os.path.split(links.attrs["src"])
        fname, ext = os.path.splitext(fullflname)
        urls_list = []
        for i in range(1, num + 1):
            # The site numbers images 01.jpg, 02.jpg, ... with zero padding
            urls_list.append("{}/{}{:0>2d}.jpg".format(filepath, fname[0:3], i))
        global count
        count = 0
        download_file(urls_list, title)


    def get_mztrl(url):
        # Collect every gallery link on a list page and download each set
        session = HTMLSession()
        myheader = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
                    "Referer": "https://www.mzitu.com/",
                    "accept-encoding": "gzip"}
        r = session.get(url, headers=myheader)
        pageurls = r.html.find("[class=postlist] li span a")
        for pageurl in pageurls:
            down_mzt(pageurl.attrs['href'])


    if __name__ == '__main__':
        start = datetime.datetime.now().replace(microsecond=0)
        for i in range(1, 2):
            get_mztrl("https://www.mzitu.com/page/{}/".format(i))
        # down_mzt("https://www.mzitu.com/194229")
        end = datetime.datetime.now().replace(microsecond=0)
        print("Download finished, elapsed: ", end='')
        print(end - start)
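One detail worth calling out: down_mzt rebuilds every image URL in a set from the first image's src, relying on the site numbering its images with zero-padded two-digit suffixes. A quick illustration of that path arithmetic (the sample URL here is made up):

    import os

    src = "https://i.example.com/2019/abc01.jpg"   # hypothetical first-image URL
    filepath, fullname = os.path.split(src)        # 'https://i.example.com/2019', 'abc01.jpg'
    fname, ext = os.path.splitext(fullname)        # 'abc01', '.jpg'
    for i in (1, 2, 10):
        print("{}/{}{:0>2d}.jpg".format(filepath, fname[0:3], i))
    # https://i.example.com/2019/abc01.jpg
    # https://i.example.com/2019/abc02.jpg
    # https://i.example.com/2019/abc10.jpg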
To change which list pages are crawled, edit the numbers in the for i in range(1, 2) loop near the bottom of the script; for example, range(1, 6) crawls list pages 1 through 5. Downloaded images are saved under D:/TS/img, and each set's folder is created automatically.