A guide to crawling Lagou.com with Python

Without further ado, here is the code. The scraped data goes straight into MongoDB.

import requests
import pymongo
import time
import random

mycon = pymongo.MongoClient('127.0.0.1', 27017)  # establish a connection
mydb = mycon['lagou_data']                       # set the database name


class LaGouSpider():
    def __init__(self, city, kd):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
            'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        }
        self.city = city
        self.max_pn = 1
        self.kd = kd


    def get_start(self):
        mycol = mydb[self.kd]  # set the collection name (one per search keyword)
        url = "https://www.lagou.com/jobs/positionAjax.json?city="+ self.city +"&needAddtionalResult=false"
        for page in range(1,10):
            data = {
                'first': 'true',
                'pn': page,
                'kd': self.kd
            }
            s = requests.Session()
            s.get(url = "https://www.lagou.com/jobs/list_python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88?labelWords=&fromSearch=true&suginput=",headers =self.headers) 
            cookies = s.cookies
            response = s.post(url=url, data=data, cookies=cookies, headers=self.headers).json()
            content = response.get('content')
            if content:
                result = content['positionResult']['result']
                print('Position: {}, city: {}, crawling page {}\n'.format(self.kd, self.city, page))
                for i in result:
                    lagou_data = {}
                    lagou_data['positionName'] = i['positionName']            # position name
                    lagou_data['companyFullName'] = i['companyFullName']      # full company name
                    lagou_data['workYear'] = i['workYear']                    # work experience
                    lagou_data['education'] = i['education']                  # education requirement
                    lagou_data['jobNature'] = i['jobNature']                  # job nature (full-time / part-time)
                    lagou_data['salary'] = i['salary']                        # salary
                    lagou_data['city'] = i['city']                            # city
                    lagou_data['financeStage'] = i['financeStage']            # financing stage
                    lagou_data['industryField'] = i['industryField']          # industry
                    lagou_data['companyShortName'] = i['companyShortName']    # short company name
                    lagou_data['positionAdvantage'] = i['positionAdvantage']  # position perks
                    lagou_data['companySize'] = i['companySize']              # company size
                    lagou_data['companyLabelList'] = i['companyLabelList']    # benefits tags
                    lagou_data['district'] = i['district']                    # district
                    lagou_data['positionLables'] = i['positionLables']        # technical tags
                    lagou_data['firstType'] = i['firstType']                  # position type
                    lagou_data['createTime'] = i['createTime']                # posting time
                    print(lagou_data)
                    mycol.insert_one(lagou_data)
            time.sleep(random.uniform(3, 7))                                  # random sleep between pages



if __name__ == '__main__':
    lagou = LaGouSpider('北京', 'python')
    lagou.get_start()
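
After a run, it is easy to sanity-check what actually landed in MongoDB. A minimal sketch, assuming the same local instance, the 'lagou_data' database, and the 'python' collection created above (count_documents requires pymongo 3.7+):

import pymongo

mycon = pymongo.MongoClient('127.0.0.1', 27017)
mycol = mycon['lagou_data']['python']   # collection is named after the search keyword

print('total records:', mycol.count_documents({}))   # how many postings were saved

# show a few Beijing postings, newest first
for doc in mycol.find({'city': '北京'}).sort('createTime', -1).limit(5):
    print(doc['positionName'], doc['salary'], doc['companyShortName'])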

Note: Lagou's anti-crawling measures are fairly typical. The trick is to first fetch cookies from the search results page, then attach them to the POST request against the JSON data API.
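
To isolate that cookie trick: the essential sequence is one GET to the HTML search page, which fills the session's cookie jar, followed by the POST to positionAjax.json from the same session. A stripped-down sketch of just that step (the generic list_{kd} URL and the blocked-response check are my own assumptions, not from the original post):

import requests

def fetch_page(city, kd, page, headers):
    # GET the search page first so the session picks up the anti-crawl cookies
    list_url = 'https://www.lagou.com/jobs/list_{}?labelWords=&fromSearch=true&suginput='.format(kd)
    ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
    s = requests.Session()
    s.get(list_url, headers=headers)
    # the session re-sends those cookies automatically on the POST
    resp = s.post(ajax_url, data={'first': 'true', 'pn': page, 'kd': kd}, headers=headers)
    body = resp.json()
    if not body.get('content'):   # stale cookies usually yield an error body instead of results
        print('blocked or throttled:', body.get('msg'))
        return []
    return body['content']['positionResult']['result']

Because requests.Session already carries cookies across requests, passing cookies=s.cookies explicitly (as the main script does) is redundant but harmless.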

 

Origin: www.cnblogs.com/lvye001/p/11307740.html