# -*-coding:utf8-*-
import requests
from bs4 import BeautifulSoup
import time
import os
import urllib
import re
import json
# Certificate verification is switched off for every request in this script
# (verify=False), so silence the warning urllib3 would emit for each call.
requests.packages.urllib3.disable_warnings()

# Browser-like request headers sent with every HTTP call.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
}

# Proxy endpoints per URL scheme (values are redacted placeholders).
proxies = {
    "http": "**********************",
    "https": "********************8",
}
def get_bs(url, timeout=30):
    """Download *url* and return its body parsed with BeautifulSoup.

    The request uses the module-level ``proxies`` and ``headers`` and skips
    TLS certificate verification (``verify=False``).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so an unresponsive host would otherwise
            block the whole crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` document built from the raw response bytes using
        the ``lxml`` parser.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    res = requests.get(
        url,
        proxies=proxies,
        headers=headers,
        verify=False,
        timeout=timeout,
    )
    return BeautifulSoup(res.content, 'lxml')
def get_first_url(page=1):
    """Collect first-level page URLs from the paginated ``mold.php`` listing.

    Args:
        page: Number of listing pages to walk, starting at page 1.

    Returns:
        A list of absolute URLs, one per ``dt a`` link whose ``href``
        contains ``"php"``, in document order.
    """
    first_url_list = []
    for page_number in range(1, page + 1):
        root_url = "https://www.model61.com/mold.php?page={}".format(page_number)
        bs = get_bs(root_url)
        for anchor in bs.select("dt a"):
            href = anchor.get('href')
            # An anchor without an href yields None; skip it instead of
            # failing on the substring test.
            if href and "php" in href:
                first_url_list.append("https://www.model61.com/{}".format(href))
    return first_url_list
def get_second_url(first_url):
    """Extract the album link and the identity text from a first-level page.

    Args:
        first_url: URL of the page to inspect.

    Returns:
        A dict that may contain:
          * ``"second_url"`` - absolute URL built from the last
            ``.cont-top a`` link whose ``href`` contains ``"album_s"``;
          * ``"identity"`` - text of the last ``.content_center_date``
            element.
        A key is absent when the page has no matching element.
    """
    # NOTE(review): the original indentation was lost; the two loops are
    # assumed to run one after the other rather than nested.
    data = {}
    bs = get_bs(first_url)
    for anchor in bs.select(".cont-top a"):
        href = anchor.get('href')
        # Anchors without an href yield None and are ignored.
        if href and "album_s" in href:
            data["second_url"] = "https://www.model61.com/{}".format(href)
    for node in bs.select(".content_center_date"):
        data["identity"] = node.get_text()
    return data
def get_thred_url(second_url):
    """Find the album-list link on a second-level page.

    Args:
        second_url: URL of the page to inspect.

    Returns:
        The absolute URL built from the first ``dt a`` link whose ``href``
        contains ``"album_list"``, or ``None`` when no such link exists.
    """
    # NOTE(review): the original indentation was lost; returning on the
    # first match is an assumption - confirm against the intended behaviour.
    bs = get_bs(second_url)
    for anchor in bs.select("dt a"):
        href = anchor.get('href')
        # Anchors without an href yield None and are ignored.
        if href and "album_list" in href:
            return "https://www.model61.com/{}".format(href)
    return None
def get_image_list(thred_url):
    """Collect the image URLs linked from an album-list page.

    Args:
        thred_url: URL of the album-list page.

    Returns:
        A list of absolute URLs built from the ``href`` of every link under
        ``.album_list_left`` followed by every link under
        ``.album_list_right``.
    """
    image_list = []
    bs = get_bs(thred_url)
    anchors = bs.select(".album_list_left a") + bs.select(".album_list_right a")
    for anchor in anchors:
        href = anchor.get('href')
        # Skip anchors without an href so no ".../None" URL is produced.
        if not href:
            continue
        image_list.append("https://www.model61.com/{}".format(href))
    return image_list
def download_image(image_path, image_url, timeout=30):
    """Download *image_url* and write the response body to *image_path*.

    This is best-effort: any failure is printed and swallowed so that one bad
    image does not stop the caller's loop.

    Args:
        image_path: Destination file path; opened in binary write mode.
        image_url: Address of the image to fetch. Redirects are not followed.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl.
    """
    try:
        r = requests.get(
            image_url,
            proxies=proxies,
            headers=headers,
            verify=False,
            allow_redirects=False,
            timeout=timeout,
        )
        # Report 4xx/5xx responses instead of saving an error page as .jpg.
        r.raise_for_status()
        with open(image_path, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print(e)
def create_face_id(data, save_path=""):
    """Create a directory named after the identity and download its images.

    The directory name is assembled from four lines of ``data["identity"]``
    (split on newlines): line 1 as-is, line 3 without its first 4 characters,
    line 7 without its first 6 characters and line 8 without its first 4
    characters, joined by underscores.
    NOTE(review): presumably the stripped prefixes are field labels on the
    scraped page - confirm against the page layout.

    Args:
        data: Dict with ``"identity"`` (multi-line text) and ``"image_list"``
            (iterable of image URLs).
        save_path: Directory under which the identity folder is created.
            The default ``""`` places it in the current working directory.

    Raises:
        IndexError: If the identity text has fewer than 9 lines.
    """
    ld_list = data["identity"].split("\n")
    identity = '_'.join(
        [ld_list[1], ld_list[3][4:], ld_list[7][6:], ld_list[8][4:]]
    )
    print(identity)
    identity_path = os.path.join(save_path, identity)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(identity_path, exist_ok=True)
    for image_url in data['image_list']:
        # File name is the current time in milliseconds.
        file_name = '{}.jpg'.format(int(time.time() * 1000))
        download_image(os.path.join(identity_path, file_name), image_url)
if __name__ == '__main__':
    # Crawl pipeline: listing page -> detail page -> album page -> image
    # list -> download. A failure on one entry is printed together with its
    # URL and the crawl moves on to the next entry.
    for first_url in get_first_url():
        try:
            data = get_second_url(first_url)
            print(data)
            album_list_url = get_thred_url(data['second_url'])
            data["image_list"] = get_image_list(album_list_url)
            create_face_id(data)
        except Exception as e:
            print(first_url, e)
# Python-爬虫小计
# 短信预约 -IT技能 免费直播动态提醒
# 免责声明:
# ① 本站未注明“稿件来源”的信息均来自网络整理。其文字、图片和音视频稿件的所属权归原作者所有。本站收集整理出于非商业性的教育和科研之目的,并不意味着本站赞同其观点或证实其内容的真实性。仅作为临时的测试数据,供内部测试之用。本站并未授权任何人以任何方式主动获取本站任何信息。
# ② 本站未注明“稿件来源”的临时测试数据将在测试完成后最终做删除处理。有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341