import requests
from bs4 import BeautifulSoup
from math import ceil
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# 获取岗位页数
def getJobPage(url):
ret = requests.get(url, headers=header)
ret.encoding = "utf-8" # 解决乱码问题
html = ret.text
soup = BeautifulSoup(html, 'html.parser')
# 获取岗位总数,< span class ="lightblue total" > 512 < / span >
totalJob = soup.select('span[class="lightblue total"]')[0].text
jobPage = ceil(int(totalJob) / 10)
return jobPage
def getJobOrder(url):
ret = requests.get(url, headers=header)
ret.encoding = "utf-8" # 解决乱码问题
html = ret.text
soup = BeautifulSoup(html, 'html.parser')
# 工作职责
jobRequests = soup.select('ul[class="squareli"]')[0].text
# 工作要求
jobOrder = soup.select('ul[class="squareli"]')[1].text
return jobRequests, jobOrder
# 获取岗位信息
def getJobInfo(url):
myfile = open("tencent_job.txt", "a", encoding='gb18030', errors='ignore') # 解决乱码问题
ret = requests.get(url, headers=header)
ret.encoding = "utf-8" # 解决乱码问题
html = ret.text
soup = BeautifulSoup(html, 'html.parser')
jobList = soup.find_all('tr', class_=['even', 'odd'])
for job in jobList:
# url
jobUrl = "https://hr.tencent.com/" + job.select('td:nth-of-type(1) > a')[0]['href']
# 职位名称
jobName = job.select('td:nth-of-type(1) > a')[0].text
# 人数
jobPeople = job.select('td:nth-of-type(3)')[0].text
# 地点
jobAddre = job.select('td:nth-of-type(4)')[0].text
# 发布时间
jobTime = job.select('td:nth-of-type(5)')[0].text
# 工作职责
jobRequests = getJobOrder(jobUrl)[0]
# 工作要求
jobOrder = getJobOrder(jobUrl)[1]
#print(jobName, jobUrl, jobAddre, jobPeople, jobTime, jobRequests, jobOrder)
tt = jobName + " " + jobUrl + " " + jobAddre + " " + jobPeople + " " + jobTime + " " + jobRequests + " " + jobOrder
myfile.write(tt + "\n")
if __name__ == '__main__':
mainurl = 'https://hr.tencent.com/position.php?keywords=python'
jobPage = getJobPage(mainurl)
print(jobPage)
for page in range(jobPage):
pageUrl = 'https://hr.tencent.com/position.php?keywords=python&start=' + str(page * 10) + '#a'
print("第" + str(page + 1) + "页")
getJobInfo(pageUrl)
# -*- coding:utf-8 -*-
import requests, json, time
from bs4 import BeautifulSoup
class tencent_hr(object):
def __init__(self):
self.base_url = "http://hr.tencent.com/position.php?"
self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
self.item_list = []
self.page = 0
# 发送请求
def send_request(self, url, params={}):
time.sleep(2)
try:
response = requests.get(url, params=params, headers=self.headers)
return response.content
except Exception as e:
print e
# 解析数据
def parse_data(self, data):
# 初始化
bs = BeautifulSoup(data, 'lxml')
# 获取标签-结果为列表
data_list = bs.select('.even, .odd')
# 将结果中的每一行数据提取出来
for data in data_list:
data_dict = {}
data_dict['work_name'] = data.select('td a')[0].get_text()
data_dict['work_type'] = data.select('td')[1].get_text()
data_dict['work_count'] = data.select('td')[2].get_text()
data_dict['work_place'] = data.select('td')[3].get_text()
data_dict['work_time'] = data.select('td')[4].get_text()
# 将每条字典数据添加进列表
self.item_list.append(data_dict)
# 判断是否是最后一页,条件:是否有noactive值
# 先找到下一页的标签
next_label = bs.select('#next')
# 根据标签获取属性class的值-返回结果为列表
judge = next_label[0].get('class')
return judge
# 写入文件
def write_file(self):
# 将列表转换成字符串
data_str = json.dumps(self.item_list)
with open('04tencent_hr.json', 'w') as f:
f.write(data_str)
# 调度运行
def run(self):
while True:
# 拼接参数
params = {
"keywords": "python",
"tid": "0",
"lid": "2156",
"start": self.page,
}
# 发送请求
data = self.send_request(self.base_url, params=params)
# 解析数据
judge = self.parse_data(data)
self.page += 10
print self.page
# 如果到了最后一页,出现noactive,跳出循环
if judge:
break
self.write_file()
if __name__ == '__main__':
spider = tencent_hr()
spider.run()