# coding=utf-8
import requests
from lxml import etree


class ChaxunSpdier:
    def __init__(self):
        self.start_url = 'http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=performQuery'
        self.part_url = 'http://111.40.232.237:9000/eoms35/sheet/complaint/'
        self.headers = {
            'Connection': 'keep-alive',
            'Cookie': 'TSJSESSIONID=0000YvxNFfPYx8EBo8lsKNrKIl6:1bkt8lo7d',  # changes with every session
            'Host': '111.40.232.237:9000',
            'Referer': 'http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=showQueryPage&type=interface&urlType=complaint&userName=liuhaoce&workSerial=0&isDutyMaster=false&workSerialTime=&startDuty=&endDuty=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'}

    def parse_url(self, url):
        formdata = {
            'sheetIdStringExpression': 'like',
            'main.sheetId': '',  # work-order serial number
            'titleStringExpression': 'like',
            'main.title': '',
            'main.status': '',
            'statusChoiceExpression': '0',
            'task.taskName': '',
            'sendRoleIdStringExpression': 'in',
            'main.sendRoleId': '',
            'sendDeptIdStringExpression': 'in',
            'main.sendDeptId': '',
            'sendUserIdStringExpression': 'in',
            'main.sendUserId': '',
            'operateRoleIdStringExpression': 'in',
            'link.operateRoleId': '',
            'operateDeptIdStringExpression': 'in',
            'link.operateDeptId': '',
            'operateUserIdStringExpression': 'in',
            'link.operateUserId': '',
            'toDeptIdStringExpression': 'in',
            'showArea': 'Daqing,China Railcom',  # province/branch the complaint is attributed to
            'main.toDeptId': '1005,1021',
            'main.complaintType1': '',
            'complaintType1ChoiceExpression': '1010615100202',  # complaint type 1: home broadband (jiakuan) service
            'main.complaintType2': '',
            'complaintType2ChoiceExpression': '',
            'main.complaintType': '',
            'main.complaintType4': '',
            'main.complaintType5': '',
            'main.complaintType6': '',
            'main.complaintType7': '',
            'complaintNumStringExpression': '',
            'main.complaintNum': '',
            'parentCorrelationStringExpression': '',
            'main.parentCorrelation': '',
            'customAttributionStringExpression': 'like',
            'main.customAttribution': '',
            'repeatComplaintTimesStringExpression': '>=',
            'main.repeatComplaintTimes': '',
            'complaintDescStringExpression': 'like',
            'main.complaintDesc': '',
            'main.sendTime': '',
            'sendTimeStartDateExpression': '>=',
            'sendTimeStartDate': '2020-02-02 20:13:35',  # start of the query window
            'sendTimeLogicExpression': 'and',
            'sendTimeEndDateExpression': '<=',
            'sendTimeEndDate': '2020-02-23 20:13:35',  # end of the query window
            'queryType': 'record'}
        response = requests.post(url, data=formdata, headers=self.headers)
        return response.content
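    # --- Sketch, not part of the original spider ---
    # The TSJSESSIONID hard-coded in self.headers changes with every login session
    # (see the comment on the Cookie header), so the value above goes stale. One way
    # to avoid pasting it by hand is to let a requests.Session carry cookies across
    # calls. The method name parse_url_with_session and the lazily created
    # self.session attribute are assumptions for illustration only.
    def parse_url_with_session(self, url, formdata):
        if not hasattr(self, 'session'):
            self.session = requests.Session()  # persists cookies between requests
            # reuse the headers defined above, minus the stale hard-coded Cookie
            self.session.headers.update(
                {k: v for k, v in self.headers.items() if k != 'Cookie'})
        response = self.session.post(url, data=formdata)
        return response.content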
    def get_content_list(self, html_raw):
        html = etree.HTML(html_raw)
        tr_list = html.xpath('//tbody/tr')  # each tr holds one complaint record
        content_list = []
        for content in tr_list:
            item = {}
            zineirong = content.xpath('./td')  # each field of the row sits in its own td tag
            item['ticket subject'] = zineirong[0].xpath('.//text()')[0]
            item['工单流水号'] = zineirong[1].xpath('./a/text()')[0]  # work-order serial number
            # item['处理时限'] = zineirong[3].xpath('./text()')[0]  # handling deadline
            detail_link = self.part_url + zineirong[1].xpath('./a/@href')[0]
            detail_dict = self.get_gongdan_detail(detail_link)
            item['xiangqing'] = detail_dict  # details of this work order
            content_list.append(item)
        # link to the next page of work orders, or None on the last page
        next_gongdan_url = self.part_url + html.xpath("//a[text()='下一页']/@href")[0] \
            if len(html.xpath("//a[text()='下一页']/@href")) > 0 else None
        return content_list, next_gongdan_url

    def get_gongdan_detail(self, url):
        html_raw = self.parse_url(url)
        html = etree.HTML(html_raw)
        xiangqing_dict = {}
        xiangqing_dict['complaint'] = html.xpath('//*[@id="complainttext"]/text()')
        # this field sits inside an iframe, so the xpath finds nothing on the outer page
        xiangqing_dict['sent-to object'] = html.xpath('//div[@id="ext-gen47"]/table/tbody/tr[4]/td[4]/text()')
        xiangqing_dict['qita'] = html.xpath('//*[@id="ext-gen47"]/text()')
        return xiangqing_dict

    def save_content_list(self, content_list):
        for i, v in enumerate(content_list, start=1):
            print(i, v)

    def run(self):
        next_url = self.start_url  # main query page for work orders
        content_total_list = []
        while next_url is not None:
            html_raw = self.parse_url(next_url)  # fetch the raw source of each page of work orders
            content_list, next_url = self.get_content_list(html_raw)  # extract the items and the next-page link
            content_total_list += content_list  # accumulate every page's items into one list
        self.save_content_list(content_total_list)  # print each collected work order


if __name__ == '__main__':
    Spdier = ChaxunSpdier()
    Spdier.run()
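The comment in get_gongdan_detail notes that the sent-to field lives inside an iframe, so an xpath against the outer document returns an empty list. A minimal sketch of one way to reach it, assuming the detail page embeds a plain <iframe src=...> element; the helper name and the xpaths are illustrative and not verified against this site:

# Sketch, not part of the original spider. If the iframe content is rendered by
# JavaScript (e.g. ExtJS) rather than loaded from a src URL, this will not work
# and a browser-automation tool would be needed instead.
def get_iframe_field(spider, detail_html, inner_xpath):
    srcs = detail_html.xpath('//iframe/@src')
    if not srcs:
        return []
    inner_raw = spider.parse_url(spider.part_url + srcs[0])  # fetch the iframe document
    return etree.HTML(inner_raw).xpath(inner_xpath)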
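save_content_list above only prints each item. A minimal sketch that persists the results to disk instead, assuming a results.json output file (the function name and path are assumptions):

import json

# Sketch, not part of the original spider.
def save_content_list_to_json(content_list, path='results.json'):
    # ensure_ascii=False keeps Chinese keys such as '工单流水号' readable in the file
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(content_list, f, ensure_ascii=False, indent=2)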