I spent five days teaching myself Python, and used them to learn how to write a Python web scraper.
First, install Python (download the latest release). Then install the requests_html library; detailed usage is documented on its GitHub page.

Install the requests_html library:

    pip install requests_html
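requests_html wraps requests and adds HTML parsing. The basic pattern the script below relies on is: open an HTMLSession, fetch a page, and query elements with CSS selectors. A minimal sketch (example.com stands in for any page you want to scrape):

    from requests_html import HTMLSession

    session = HTMLSession()
    r = session.get("https://example.com")        # fetch the page
    for link in r.html.find("a"):                 # CSS selector query
        print(link.text, link.attrs.get("href"))  # element text and attributes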
The downloader uses multiple threads, and each gallery page gets its own folder.
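Before reading the full script, here is the bare multithreading pattern it uses, stripped of the download logic: the URL list is cut into equal slices, one thread per slice, the last slice takes the remainder, and a lock guards the shared progress counter. The worker body below just counts; in the real script it downloads one file per item:

    import threading

    lock = threading.Lock()
    count = 0

    def worker(items, start, end):
        # process only the slice items[start:end]
        global count
        for item in items[start:end]:
            # a real worker would download `item` here
            with lock:      # the lock keeps the shared counter consistent
                count += 1

    def run(items, num_thread=5):
        part = len(items) // num_thread
        threads = []
        for i in range(num_thread):
            start = part * i
            # the last slice absorbs the remainder when the split is uneven
            end = len(items) if i == num_thread - 1 else start + part
            threads.append(threading.Thread(target=worker, args=(items, start, end)))
            threads[-1].start()
        for t in threads:
            t.join()        # block until every slice is processed

    run(["url%d" % i for i in range(13)])
    print(count)            # prints 13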
The complete code is below.
    from requests_html import HTMLSession
    import requests
    import datetime
    import sys
    import threading
    import re
    import os

    lock = threading.Lock()
    count = 0


    def Handler(start, end, urls, path_name):
        # Each worker thread downloads its own slice urls[start:end]
        global count
        for url in urls[start:end]:
            header = {"Referer": "https://www.mzitu.com/"}
            html = requests.get(url, timeout=10, headers=header)
            # Timestamped file names keep parallel threads from overwriting each other
            path = "D:/TS/img/{0}/{1}.jpg".format(
                path_name, datetime.datetime.now().strftime('%Y%m%d%H%M%S%f'))
            with open(path, 'wb') as file:
                file.write(html.content)
            with lock:
                count += 1
            num = len(urls)
            done = int(50 * count / num)  # 50 is the width of the progress bar
            # \r moves the cursor back to the start of the line,
            # so each write redraws the progress bar in place
            sys.stdout.write("\rProgress: [%s%s] %d%%"
                             % ('█' * done, ' ' * (50 - done), 100 * count / num))
            sys.stdout.flush()


    def download_file(urls_list, path_name, num_thread=5):
        '''
        Earlier single-page experiment, kept here disabled:
        s_list = []
        session = HTMLSession()
        myheader = {
            'cookie': 'tt_webid=6705276193249183240; __tea_sdk__ssid=undefined; passport_auth_status=4a9f2962712b3b506165773ad2e6306c; sso_auth_status=837c704700f69a2fde9c4355fa9590a6; sso_uid_tt=59274967267a0a837da1e7079c1481f9; toutiao_sso_user=0eac5ef2472d1a0761cb1823053bf5c7; sessionid=213fa2b2b530ccebccdce18228e7e8f5; _mp_test_key_1=a799564a27129c71b94096eabf84f280; uid_tt=ea22a09bb3ec13fcc29415915afafef1; _ba=BA0.2-20191003-5110e-Ot9q1XxXYtSpyTFCYg6H; currentMediaId=6142685756',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        }
        r = session.get(url, headers=myheader)
        img_title = r.html.find('h4')[0].text
        mkdir("D:\\TS\\img\\{}\\".format(img_title))
        titles = r.html.find('[data-src]')
        for title in titles:
            s_list.append(title.attrs['data-src'])
        '''
        file_size = len(urls_list)
        # Split the URL list into equal chunks, one per thread;
        # the last chunk takes the remainder when the list does not divide evenly
        part = file_size // num_thread
        for i in range(num_thread):
            start = part * i
            end = file_size if i == num_thread - 1 else start + part
            t = threading.Thread(target=Handler,
                                 kwargs={'start': start, 'end': end,
                                         'urls': urls_list, 'path_name': path_name})
            t.daemon = True
            t.start()
        # Wait until every download thread has finished
        main_thread = threading.current_thread()
        for t in threading.enumerate():
            if t is main_thread:
                continue
            t.join()
        print()


    def mkdir(path):
        # Strip surrounding whitespace and any trailing backslash,
        # then create the directory only if it does not already exist
        path = path.strip().rstrip("\\")
        if not os.path.exists(path):
            os.makedirs(path)
            print(path + ' created')
            return True
        else:
            print(path + ' already exists')
            return False


    def down_t66y(url):
        # Experimental crawler for a second site; the download call is still disabled
        session = HTMLSession()
        myheader = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
        r = session.get(url, headers=myheader)
        links = r.html.find('h3 a')
        for link in links:
            if "套" in link.text:  # keep only threads whose title contains "套"
                global count
                count = 0
                # download_file("https://cc.vttg.pw/" + link.attrs["href"])
                # print(link.attrs["href"])


    def down_mzt(url):
        # Download one complete image set from its gallery page
        session = HTMLSession()
        myheader = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
                    "Referer": "https://www.mzitu.com/",
                    "accept-encoding": "gzip"}
        r = session.get(url, headers=myheader)
        # The last <span>NN in the markup is the number of images in the set
        pages = re.findall(r'<span>\d{1,2}', r.text)
        num = int(pages[-1][6:])
        title = r.html.find('h2[class=main-title]')[0].text
        path_name = "D:\\TS\\img\\{}\\".format(title)
        mkdir(path_name)
        # Take the first image URL apart: filepath is the directory part,
        # fullflname the full file name (e.g. abc01.jpg), fname the bare
        # name (abc01), ext the extension (.jpg)
        links = r.html.find('div[class=main-image] img')[0]
        filepath, fullflname = os.path.split(links.attrs["src"])
        fname, ext = os.path.splitext(fullflname)
        urls_list = []
        for i in range(1, num + 1):
            # The site numbers images 01.jpg, 02.jpg, ... with zero padding
            urls_list.append("{}/{}{:0>2d}.jpg".format(filepath, fname[0:3], i))
        global count
        count = 0
        download_file(urls_list, title)


    def get_mztrl(url):
        # Collect every gallery link on a list page and download each set
        session = HTMLSession()
        myheader = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
                    "Referer": "https://www.mzitu.com/",
                    "accept-encoding": "gzip"}
        r = session.get(url, headers=myheader)
        pageurls = r.html.find("[class=postlist] li span a")
        for pageurl in pageurls:
            down_mzt(pageurl.attrs['href'])


    if __name__ == '__main__':
        start = datetime.datetime.now().replace(microsecond=0)
        for i in range(1, 2):
            get_mztrl("https://www.mzitu.com/page/{}/".format(i))
        # down_mzt("https://www.mzitu.com/194229")
        end = datetime.datetime.now().replace(microsecond=0)
        print("Download finished, elapsed: ", end='')
        print(end - start)
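One detail worth calling out: down_mzt rebuilds every image URL in a set from the first image's src, relying on the site numbering its images with zero-padded two-digit suffixes. A quick illustration of that path arithmetic (the sample URL here is made up):

    import os

    src = "https://i.example.com/2019/abc01.jpg"   # hypothetical first-image URL
    filepath, fullname = os.path.split(src)        # 'https://i.example.com/2019', 'abc01.jpg'
    fname, ext = os.path.splitext(fullname)        # 'abc01', '.jpg'
    for i in (1, 2, 10):
        print("{}/{}{:0>2d}.jpg".format(filepath, fname[0:3], i))
    # https://i.example.com/2019/abc01.jpg
    # https://i.example.com/2019/abc02.jpg
    # https://i.example.com/2019/abc10.jpg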
To change which list pages are crawled, edit the numbers in the for i in range(1, 2) loop near the bottom of the script; for example, range(1, 6) crawls list pages 1 through 5. Downloaded images are saved under D:/TS/img, and each set's folder is created automatically.