爬虫私教课-实习僧

答疑·限时优惠

如果你想让我看见你的疑问,并且百分之百地得到回答,可以加入我的知识星球。
AI悦创·进化岛
AI悦创·进化岛

你好,我是悦创。

课程源码:
课程源码已修改为多线程版本:

"""
project = 'Code', file_name = 'spider', author = 'AI悦创'
time = '2020/4/11 11:01', product_name = PyCharm
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
"""
import time
import httpx
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import threading

# HTTP request headers shared by every page fetch made by ShiXiSeng:
# a desktop Chrome user-agent string plus a raw cookie string.
# NOTE(review): the cookie presumably was copied from a logged-in browser
# session and contains *_EXP entries, so it likely expires — confirm and
# refresh before running. Hard-coded session cookies are credentials and
# should not be committed to shared source.
headers ={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
    'cookie': '__jsluid_s=cd83b4d69f4ab44cb06be1accb4cc1f0; gr_user_id=0d038710-8b3c-458e-b24e-4dfddf81678d; SXS_XSESSION_ID="2|1:0|10:1586252279|15:SXS_XSESSION_ID|48:NjQwNTlmYzgtN2UwOC00MjVhLTkwYjgtMmVkZTg3ZThmOTlm|177fd855591d1cb18419b934149eac85a5dc019bc405e5766dbcc607d7a76b87"; SXS_XSESSION_ID_EXP="2|1:0|10:1586252279|19:SXS_XSESSION_ID_EXP|16:MTU4NjMzODY3OQ==|f0c98dcc2df131c89b89fcf82003600d2f6d3c5b9bb413d773d5b0ca1af70781"; uuid=557984a3-f1eb-d32a-bbe5-d1a04d15ed79; search=; Hm_lvt_03465902f492a43ee3eb3543d81eba55=1584630605,1586176927,1586252282,1586259541; MEIQIA_TRACK_ID=1Xz59NchxnvhxGfnzWD8u6imsRB; MEIQIA_VISIT_ID=1aNRrWmHJjsaKO5Id2Rk04j1icd; uid1=e7aa7c2d-a147-9dd4-a247-3044c2794179; uid2=97397d2a-5cfa-0894-ad45-85509228befd; Hm_lpvt_03465902f492a43ee3eb3543d81eba55=1586575274; gr_session_id_96145fbb44e87b47=15495963-3f3c-41d0-aeb0-c81e3bce662b; gr_cs1_15495963-3f3c-41d0-aeb0-c81e3bce662b=user_id%3Anull; gr_session_id_96145fbb44e87b47_15495963-3f3c-41d0-aeb0-c81e3bce662b=true; SXS_VISIT_XSESSION_ID_V3.0="2|1:0|10:1586577044|26:SXS_VISIT_XSESSION_ID_V3.0|48:OTc5YjFmM2MtN2U2OS00MTUxLWE5YzYtOTdlNjFlNWQ0ZDhl|ce71895297d0f00342e9d918c44f42c8b26e661c2d4b34ef8f22df843e8b968f"; SXS_VISIT_XSESSION_ID_V3.0_EXP="2|1:0|10:1586577044|30:SXS_VISIT_XSESSION_ID_V3.0_EXP|16:MTU4OTE2OTA0NA==|d2f94075301f75c91a955a0c7bee0ee0bc7e8f734c01d7bd95d228d4ebb83136"'
}

class ShiXiSeng(object):
    """Scrape internship listings (keyword "Java") from shixiseng.com.

    ``main`` builds the URLs for listing pages 1-19 and fetches them in
    parallel threads; each page is parsed by ``parse_page``, which prints
    one ``title``/``salary`` line per listing card.
    """

    # Private-use code points that appear in the salary text, mapped to the
    # digits they stand for (the UTF-8 forms are EE A9 B0, EE 90 A4,
    # EE 94 AA and EF A3 AF). Presumably these are glyphs of an
    # obfuscating web font used by the site — only these four are known.
    _DIGIT_GLYPHS = str.maketrans({
        '\uea70': '0',
        '\ue424': '2',
        '\ue52a': '3',
        '\uf8ef': '5',
    })

    # Serializes output so lines printed by different page threads do not
    # interleave now that the pages are really fetched concurrently.
    _print_lock = threading.Lock()

    def __init__(self):
        """Store the module-level ``headers`` dict for use in every request."""
        self.headers = headers

    def requests_func(self, url):
        """Fetch ``url`` with the shared headers.

        Returns:
            The httpx response when the status code is 200, otherwise
            ``None`` (non-200 status or an httpx request/transport error).
        """
        try:
            result = httpx.get(url, headers=self.headers)
        except httpx.HTTPError:
            # httpx raises its own exception hierarchy; the exception class
            # from ``requests`` never matches errors raised by httpx.get.
            return None
        if result.status_code == 200:
            return result
        return None

    def salary_encode(self, salary):
        """Return ``salary`` with the known obfuscated glyphs replaced by digits.

        Characters without a known mapping are left unchanged.
        """
        return salary.translate(self._DIGIT_GLYPHS)

    def parse_page(self, url):
        """Fetch one listing page and print the title and salary of each offer.

        Pages that cannot be fetched are skipped silently, as are listing
        cards that lack a title link or a salary span.
        """
        response = self.requests_func(url)
        if response is None:
            # requests_func signals any failure with None; nothing to parse.
            return

        soup = BeautifulSoup(response.text, 'lxml')
        offers = soup.select('#__layout .interns .result-list .primary-content div div div.intern-wrap')

        lines = []
        for index, offer in enumerate(offers):
            title_tag = offer.select_one('.f-l p a')
            salary_tag = offer.select_one('.f-l p span')
            if title_tag is None or salary_tag is None:
                continue
            numbers = self.salary_encode(salary_tag.text)
            lines.append(f'{index}title:>{title_tag.text}, salary:>{numbers}')

        if lines:
            with self._print_lock:
                print('\n'.join(lines))

    def main(self):
        """Scrape listing pages 1 through 19 concurrently, one thread per page."""
        urls = ['https://www.shixiseng.com/interns?'
                'page={}&keyword=Java&type=intern&area='
                '&months=&days=&degree=&official=&enterprise=&salary=-0&'
                'publishTime=&sortType=&city=%E8%8E%86%E7%94%B0&internExtend='.format(page)
                for page in range(1, 20)]
        threads = [threading.Thread(target=self.parse_page, args=(url,)) for url in urls]
        # Start every thread before joining any of them; joining right after
        # start() would make the pages download strictly one after another.
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        
        
        
if __name__ == '__main__':
    # Script entry point: build the scraper and crawl all listing pages.
    spider = ShiXiSeng()
    spider.main()

    
AI悦创·创造不同!
AI悦创 » 爬虫私教课-实习僧

Leave a Reply