Python 爬取淘宝 MM(模特)信息
短信预约 -IT技能 免费直播动态提醒
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import sys
import re
# Python 2 text-encoding workaround: make implicit str <-> unicode conversions
# use UTF-8 instead of ASCII.  `sys.setdefaultencoding` is removed from `sys`
# during interpreter start-up, so the module has to be reloaded to get it back.
# NOTE(review): presumably needed because the script concatenates UTF-8 byte
# string literals with unicode text returned by BeautifulSoup -- confirm.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    # Python 3 has no builtin `reload`, and its default encoding is already
    # UTF-8, so there is nothing to do.
    pass
# Endpoints of the listing pages and of the per-model detail fragment.
LIST_URL = 'http://mm.taobao.com/json/request_top_list.htm?page=%d'
INFO_URL = "http://mm.taobao.com/self/info/model_info_show.htm?user_id=%d"

# Seconds to wait for the server before giving up on a request.  Without a
# timeout a single stalled connection would block the whole crawl forever.
REQUEST_TIMEOUT = 10

# (label, CSS class) pairs of the measurement cells on the detail page, in the
# order they are printed.  The class names are copied verbatim from the page
# markup the script targets (including the 'shose' spelling).
_BODY_FIELDS = (
    ("身高:", 'mm-p-small-cell mm-p-height'),
    ("体重:", 'mm-p-small-cell mm-p-weight'),
    ("三围:", 'mm-p-small-cell mm-p-size'),
    ("罩杯:", 'mm-p-small-cell mm-p-bar'),
    ("鞋码:", 'mm-p-small-cell mm-p-shose'),
)


def _fetch_soup(url):
    """Download *url* and return it parsed with the lxml parser.

    The response body is decoded as gb2312 before parsing.  Raises whatever
    `requests.get` raises on connection errors or timeouts.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = 'gb2312'
    return BeautifulSoup(response.text, 'lxml')


def _avatar_src(img_tag):
    """Return the image address stored on *img_tag*.

    The previous code looked up the key 'class="lazy" data-src'.  No parsed
    tag can carry an attribute with that name, so the lookup raised KeyError
    for every model and the closing banner was never printed.
    NOTE(review): the key looks like a lazy-loading rewrite of `src` applied
    to the published listing -- confirm against the live markup.  `src` is
    tried first and `data-src` is used as a fallback.

    Raises KeyError when neither attribute is present.
    """
    src = img_tag.get('src') or img_tag.get('data-src')
    if src is None:
        raise KeyError('src')
    return src


def _print_model(model):
    """Print every field of one `.list-item` element of a listing page.

    Fetches the model's detail fragment (one extra HTTP request) and prints
    the list-page and detail-page fields line by line.  Any missing element
    raises (AttributeError, TypeError, KeyError, IndexError, ...); the caller
    decides how to report it.  Lines printed before the failure stay printed.
    """
    name_tag = model.find('a', {'class': 'lady-name'})
    follow_tag = model.find('span', {'class': 'friend-follow J_FriendFollow'})
    top_line = model.find('p', {'class': 'top'})

    model_id = follow_tag['data-userid']
    detail = _fetch_soup(INFO_URL % int(model_id))

    banner = "***********************************" + name_tag.string + "*********************************"
    print(banner)
    print("模特的名字:" + name_tag.string)
    print("模特的年龄:" + top_line.em.strong.string)
    print("生日:" + detail.find('li', {'class': 'mm-p-cell-left'}).span.string)

    blood = detail.find_all('li', {'class': 'mm-p-cell-right'})[1].span.string
    if blood is None:
        # Blood type is the only field the script tolerates being empty.
        blood = "无"
    print("血型:" + blood)

    print("学校/专业:" + detail.find_all('li')[5].span.string)
    for label, css_class in _BODY_FIELDS:
        print(label + detail.find('li', {'class': css_class}).p.string)

    print("模特所在地:" + top_line.span.string)
    print("模特的id:" + model_id)

    second_line = model.find_all('p')[1]
    print("模特的标签:" + second_line.em.string)
    print("模特的粉丝数:" + second_line.strong.string)

    # First non-empty text fragment of the popularity header is the rank.
    # next() avoids the Python 2 list-comprehension variable leak that used to
    # overwrite the module-level `text`.
    rank_strings = model.find('div', {'class': 'popularity'}).dl.dt.stripped_strings
    print("模特的排名:" + next(iter(rank_strings)))

    print(model.find('ul', {'class': 'info-detail'}).get_text(" ", strip=True))
    # The page stores these addresses without a scheme, so "http:" is prepended.
    print("模特的个人资料页面:" + "http:" + name_tag['href'])
    print("模特的个人作品页面:" + "http:" + model.find('a', {'class': 'lady-avatar'})['href'])
    print("模特的个人头像:" + "http:" + _avatar_src(model.find('img')))
    print(banner)
    print("\n")


# Walk listing pages 1..4299.  A failure on one model or one page is reported
# and skipped so the crawl keeps going (best effort).  `except Exception`
# rather than a bare `except` lets Ctrl-C / SystemExit stop the run.
# print() is always called with a single argument so the statement behaves the
# same under Python 2 and Python 3.
for num in range(1, 4300):
    try:
        URL = LIST_URL % num
        soup = _fetch_soup(URL)
        for model in soup.select(".list-item"):
            try:
                _print_model(model)
            except Exception:
                print("error")
    except Exception:
        # str(num): concatenating the int directly raised TypeError inside the
        # handler and aborted the whole crawl on the first failing page.
        print(str(num) + "page is error")
免责声明:
① 本站未注明“稿件来源”的信息均整理自网络。其文字、图片和音视频稿件的所有权归原作者所有。本站收集整理出于非商业性的教育和科研之目的,并不意味着本站赞同其观点或证实其内容的真实性。仅作为临时的测试数据,供内部测试之用。本站并未授权任何人以任何方式主动获取本站任何信息。
② 本站未注明“稿件来源”的临时测试数据将在测试完成后最终做删除处理。有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341