Crawling Python job listings from BOSS Zhipin with Python

1. Preparation: shared helper methods

Get a database connection:

import pymysql

# Get a database connection object
def getConnect(database):
    DATABASE = {
        'host': 'localhost',
        'database': database,
        'user': 'root',
        'password': '123456'
    }
    return pymysql.connect(**DATABASE)
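
The crawler below writes into a MySQL database named reptile with a work_info table. As a minimal sketch, the table can be created as follows; the column names match the INSERT used in section 2, but the column types here are my own assumptions, not from the original post.

import pymysql

# Sketch only: create the `work_info` table the crawler writes to.
# Column names come from the INSERT in section 2; the types are assumptions.
db = pymysql.connect(host='localhost', user='root', password='123456', database='reptile')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS `work_info` (
        `id` INT AUTO_INCREMENT PRIMARY KEY,
        `title` VARCHAR(255),
        `salary` VARCHAR(64),
        `site` VARCHAR(255),
        `experience` VARCHAR(64),
        `education` VARCHAR(64),
        `job_url` VARCHAR(512),
        `company` VARCHAR(255),
        `release_date` VARCHAR(32),
        `get_date` VARCHAR(32)
    ) DEFAULT CHARSET=utf8mb4
""")
db.commit()
cursor.close()
db.close()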

Get a page's soup object:

import requests
from bs4 import BeautifulSoup

# Convert an HTML string into a soup object
def to_soup(html):
    return BeautifulSoup(html, 'lxml')

# Fetch the page at the given URL with the given headers and return its soup object
def get_soup(url, header):
    response = requests.get(url, headers=header)
    return to_soup(response.text)
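
As a quick usage example, assuming the header dict defined in section 2 is available, fetching the first listing page and printing its title looks like this (the URL is the one used later in this post):

# Example usage of get_soup; header is the request header dict from section 2
soup = get_soup('https://www.zhipin.com/c101270100/?query=python&page=1', header)
print(soup.title.get_text())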

2. Crawling Python-related positions from BOSS Zhipin, single-threaded

Definition of the job information class:

class WorkInfo:
    def __init__(self, title, salary, site, experience, education, job_url, company, release_date, get_date):
        self.title = title
        self.salary = salary
        self.site = site
        self.experience = experience
        self.education = education
        self.job_url = job_url
        self.company = company
        self.release_date = release_date
        self.get_date = get_date

Function that collects the job information from a page into a list:

# Collect the job information on one listing page
def getWorkInfos(url, header):
    # Get the page's soup object (rep is the module holding get_soup from part 1)
    htmlSoup = rep.get_soup(url, header)
    workInfos = []
    # Get the list of job blocks on the page
    job_infos = htmlSoup.find_all('div', class_='job-primary')
    if len(job_infos) == 0:
        print('Hit a blank page!')
        return workInfos
    # Walk each block and pull out the detailed fields
    print('Start crawling this page!')
    for job_info_soup in job_infos:
        # Title
        title = job_info_soup.find('div', class_='job-title').get_text()
        # Salary
        salary = job_info_soup.find('span', class_='red').get_text()
        infos = str(job_info_soup.find('p'))
        infosList = tool.toContent(infos)  # tool is the module holding toContent from part 3
        # Work location
        site = infosList[0]
        # Experience requirement
        experience = infosList[1]
        # Education requirement
        education = infosList[2]
        # Link to the job detail page
        job_url = job_info_soup.find('a').get('href')
        # Company name
        company = job_info_soup.find('div', class_='company-text').find('a').get_text()
        # Release date
        release_date = job_info_soup.find('div', class_='info-publis').find('p').get_text()[3:]
        # Normalize the date string into the format the database expects
        if '昨天' in release_date:    # published yesterday
            release_date = time.strftime("%Y-%m-%d", time.localtime(time.time() - 86400))
        elif ':' in release_date:     # published today (the page shows a clock time)
            release_date = time.strftime("%Y-%m-%d")
        else:                         # published on an earlier date such as '5月30日'
            release_date = str(time.localtime().tm_year) + '-' + re.sub(r'[月,日]', '-', release_date)[:-1]
        # Time the data was crawled
        get_date = time.strftime("%Y-%m-%d %H:%M:%S")
        workInfo = WorkInfo(title, salary, site, experience, education, job_url, company, release_date, get_date)
        workInfos.append(workInfo)
    print('Finished crawling this page!')
    return workInfos

Store the collected job information in the database:

# Save to the database
def toDatabase(workInfos):
    print('Start saving to the database')
    db = database.getConnect('reptile')
    cursor = db.cursor()
    for workInfo in workInfos:
        sql = "INSERT INTO `work_info` (`title`, `salary`, `site`, `experience`, `education`, `job_url`, `company`, `release_date`, `get_date`)" \
          " VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
          % (workInfo.title, workInfo.salary, workInfo.site, workInfo.experience, workInfo.education, workInfo.job_url, workInfo.company, workInfo.release_date, workInfo.get_date)
        cursor.execute(sql)
    cursor.close()
    db.commit()
    db.close()
    print('Finished saving to the database!')
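
Note that building the SQL with % string formatting breaks as soon as a field contains a single quote, and it is open to SQL injection. A hedged alternative with the same table, fields and database helper, letting pymysql bind the parameters itself:

# Sketch: the same insert done with pymysql parameter binding instead of string formatting
def toDatabaseSafe(workInfos):
    db = database.getConnect('reptile')
    cursor = db.cursor()
    sql = ("INSERT INTO `work_info` (`title`, `salary`, `site`, `experience`, `education`,"
           " `job_url`, `company`, `release_date`, `get_date`)"
           " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    rows = [(w.title, w.salary, w.site, w.experience, w.education,
             w.job_url, w.company, w.release_date, w.get_date) for w in workInfos]
    cursor.executemany(sql, rows)
    db.commit()
    cursor.close()
    db.close()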

The main crawling loop:

url = 'https://www.zhipin.com/c101270100/?'
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'referer': 'https://www.zhipin.com/c101270100/?query=python&page=2&ka=page-2',
    'cookie':'lastCity=101270100; _uab_collina=155876824002955006866925; t=DPiicXvgrhx7xtms; wt=DPiicXvgrhx7xtms; sid=sem_pz_bdpc_dasou_title; __c=1559547631; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00fDIFkY0IWPB0KZEgsZb1OwT00000Kd7ZNC00000JqHYFm.THdBULP1doZA80K85yF9pywdpAqVuNqsusK15yF9m1DdmWfdnj0sm1PhrAf0IHYYnD7aPH9aPRckwjRLrjbsnYfYfWwaPYwDnHuDfHcdwfK95gTqFhdWpyfqn1czPjmsPjnYrausThqbpyfqnHm0uHdCIZwsT1CEQLILIz4lpA-spy38mvqVQ1q1pyfqTvNVgLKlgvFbTAPxuA71ULNxIA-YUAR0mLFW5HRvnH0s%26tpl%3Dtpl_11534_19713_15764%26l%3D1511867677%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E5%252587%252586%2525E5%2525A4%2525B4%2525E9%252583%2525A8-%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253DBoss%2525E7%25259B%2525B4%2525E8%252581%252598%2525E2%252580%252594%2525E2%252580%252594%2525E6%252589%2525BE%2525E5%2525B7%2525A5%2525E4%2525BD%25259C%2525EF%2525BC%25258C%2525E6%252588%252591%2525E8%2525A6%252581%2525E8%2525B7%25259F%2525E8%252580%252581%2525E6%25259D%2525BF%2525E8%2525B0%252588%2525EF%2525BC%252581%2526xp%253Did(%252522m3224604348_canvas%252522)%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D8%26wd%3Dboss%25E7%259B%25B4%25E8%2581%2598%26issp%3D1%26f%3D3%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26oq%3D%2525E5%25258D%25259A%2525E5%2525AE%2525A2%2525E5%25259B%2525AD%26inputT%3D9649%26prefixsug%3Dboss%26rsp%3D0&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1558768262,1558768331,1559458549,1559547631; JSESSIONID=A0FC9E1FD0F10E42EAB681A51AC459C7;'
             ' __a=86180698.1558768240.1559458549.1559547631.63.3.6.6; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1559551561'
}
query = 'python'
page = 1
while True:
    print("Starting page {}".format(page))
    purl = url + 'query=' + query + '&page=' + str(page) + '&ka=page-' + str(page)
    workInfos = getWorkInfos(purl, header)
    if len(workInfos) == 0:
        print('Finished crawling!')
        break
    toDatabase(workInfos)
    page += 1
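
For reference, the main script needs re and time, plus the helper modules implied by the qualified calls above (rep for get_soup, tool for toContent, database for getConnect). Pausing briefly between pages is also worthwhile; the 2-second value in the sketch below is just an assumption.

import re
import time

import database   # getConnect() from part 1
import rep        # get_soup() / to_soup() from part 1
import tool       # toContent() from part 3

# Optional: inside the while loop, after toDatabase(workInfos), pause before the next page
# time.sleep(2)   # assumed delay to avoid hammering the site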

3. Small techniques involved

A homemade helper that strips HTML tags and returns the text between them as a list:

# Use a regular expression to strip HTML tags and return the list of text fragments between them
def toContent(html):
    infos = re.split('<[^>]*>', html)
    # Remove empty elements
    return list(filter(None, infos))
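
For example, applied to the <p> block of a job card (the HTML snippet below is an illustrative guess at what BOSS Zhipin renders, not copied from the site), it yields the location, experience and education strings in order:

# Illustrative input; the real string comes from str(job_info_soup.find('p'))
info_html = '<p>成都<em class="vline"></em>3-5年<em class="vline"></em>本科</p>'
print(toContent(info_html))
# ['成都', '3-5年', '本科']  -> site, experience, education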

Time-related operations

Replace '月' (month) and '日' (day) with '-':

re.sub(r'[月,日]', '-', release_date)

Get the date of the previous day:

release_date=time.strftime("%Y-%m-%d",time.localtime(time.time()-86400))
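
Putting the two together for a date scraped as, say, '6月3日' (an assumed example value), the year prefix comes from the current local time and the trailing '-' is trimmed off:

import re
import time

release_date = '6月3日'   # assumed example value scraped from the page
release_date = str(time.localtime().tm_year) + '-' + re.sub(r'[月,日]', '-', release_date)[:-1]
print(release_date)       # e.g. '2019-6-3'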

 



Origin www.cnblogs.com/tutuwowo/p/10975003.html