Study Log for February 13

1. Vocabulary memorized: stem (stalk, trunk); cruel (cruel, brutal); dump (to dump, to tip out); restless (unable to rest); such (of this kind; aforementioned); strain (to tighten; tension; to sprain); pregnant (pregnant; filled with); conversation (conversation, talk); scale (graduation marks; balance scale); notorious (notorious, infamous); reproach (to blame, to reproach); liver (the liver); amend (to amend, to revise); vicious (vicious, ferocious); donkey (donkey; fool); dialect (dialect); moist (damp, moist); mischief (harm, damage); outlet (outlet, exit); bud (bud, flower bud); metropolitan (of a capital or major city)

2. Crawl each letter page listed in the URL file and save its content to a file

  Today I still did not manage to crawl multiple pages with WebMagic, so I crawled each letter's content with a Python loop instead. Records for different letters are separated by $$, and the fields within one letter are separated by &&.
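  For reference, each serialized record uses the field order id&&send_time&&type&&message&&status&&addresser&&asker&&details&&answering&&answer_time&&reply (the join at the end of get_xinjian_message below). A record might look like this, where every value is a made-up placeholder:

AH20001234567&&2020-02-10&&咨询&&subject-text&&yes&&sender-name&&3&&details-text&&answering-unit&&2020-02-13&&reply-text$$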

import re

import requests
from lxml import etree


def getHTMLText(url):
    # Fetch a page with a browser User-Agent and return its text, or "" on failure.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def first_match(pattern, text):
    # Return the first regex match in text, or "" when nothing matches.
    m = re.search(pattern, text)
    return m.group(0) if m else ""


def get_xinjian_message(url, fpath):
    print(url)
    xinjian_html = getHTMLText(url)
    if not xinjian_html:
        return ""
    soup = etree.HTML(xinjian_html)
    # Letter id: "AH" plus 11 digits, pulled from the reply link's onclick attribute.
    Id = first_match(r"AH[0-9]{11}", "".join(soup.xpath("//a[@class='dex_yes font12']/@onclick")))
    # Letter type: the first two non-ASCII (Chinese) characters of the Keywords meta tag.
    letter_type = first_match(r"^[^\x00-\xff]{2}", "".join(soup.xpath("//meta[@name='Keywords']/@content")))
    status = "yes"
    message = "".join(soup.xpath("//div[@class='col-xs-10 col-sm-10 col-md-10 o-font4 my-2']/strong/text()"))
    # Sender name: the text after the 来信人: label, with all whitespace removed.
    addresser = "".join("".join(soup.xpath("//div[@class='col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted ']/text()")).split())
    addresser = first_match(r"(?<=来信人:).*", addresser)
    send_time = first_match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", "".join(soup.xpath("//div[@class='col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted ']/text()")))
    asker = first_match(r"[0-9]+", "".join(soup.xpath("//div[@class='col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted ']/label/text()")))
    details = "".join("".join(soup.xpath("//div[@class='col-xs-12 col-md-12 column p-2 text-muted mx-2']/text()")).split())
    answering = "".join("".join(soup.xpath("//div[@class='col-xs-9 col-sm-7 col-md-5 o-font4 my-2']/text()")).split())
    answer_time = first_match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", "".join(soup.xpath("//div[@class='col-xs-12 col-sm-3 col-md-3 my-2 ']/text()")))
    reply = "".join("".join(soup.xpath("//div[@class='col-xs-12 col-md-12 column p-4 text-muted my-3']/text()")).split())
    # Join the fields with && and terminate the record with $$.
    xinjian = "&&".join([Id, send_time, letter_type, message, status, addresser,
                         asker, details, answering, answer_time, reply]) + "$$"
    print(xinjian)
    save_file(fpath, xinjian)
    return xinjian

def save_file(file_path, msg):
    # Append one record to the output file.
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(msg)
        
def Run(out_put_file, fpath):
    # Read the comma-separated URL list and crawl every letter page in it.
    with open(out_put_file, "r") as f:
        urls = f.read()
    lsts = urls.split(",")
    lst = []
    for cond, url in enumerate(lsts, start=1):
        xinjian = get_xinjian_message(url, fpath)
        lst.append(xinjian)
        print("\nprogress:{:.2f}%".format(cond * 100 / len(lsts)), end="")
    return lst
    

def main():
    fpath = "outfile/xinjian.txt"         # output: one $$-terminated record per letter
    out_put_file = "outfile/xj_list.txt"  # input: comma-separated letter URLs
    lst = Run(out_put_file, fpath)
    for i in lst:
        print(i)


if __name__ == "__main__":
    main()
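
Running the script reads the comma-separated URLs from outfile/xj_list.txt, appends one $$-terminated record per letter to outfile/xinjian.txt, and prints a progress percentage as it goes.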

3. Problems encountered: when I read the saved letter data back from the file and split it, the results were not good; in many places the text was not actually split at the specified delimiters. A likely cause is that $$ or && can also occur inside the letter text itself.
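Here is a minimal read-back sketch for pinning down the bad records, assuming the file was produced by the script above; instead of trusting split blindly, it checks the field count and flags anything malformed:

# Sketch: read outfile/xinjian.txt back and split it strictly.
FIELDS = ["id", "send_time", "type", "message", "status", "addresser",
          "asker", "details", "answering", "answer_time", "reply"]

with open("outfile/xinjian.txt", "r", encoding="utf-8") as f:
    raw = f.read()

letters = []
for record in raw.split("$$"):
    if not record.strip():
        continue  # skip the empty chunk after the final $$
    parts = record.split("&&")
    if len(parts) != len(FIELDS):
        # A field probably contained && or $$ itself; flag it for inspection.
        print("malformed record:", parts[0] if parts else record[:30])
        continue
    letters.append(dict(zip(FIELDS, parts)))

A sturdier fix would be to drop the ad-hoc delimiters entirely and write one JSON object per line with json.dumps, since letter bodies can contain almost any character.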

4. Plan for tomorrow: try using Spark to split the letter content and write the data into MySQL.
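
As a rough starting point for that step, a PySpark sketch; it assumes pyspark is installed, the MySQL JDBC driver jar is on Spark's classpath, and the database, table, and credentials used here (xinjian_db, xinjian, root) are placeholders to be replaced:

from pyspark.sql import Row, SparkSession

FIELDS = ["id", "send_time", "type", "message", "status", "addresser",
          "asker", "details", "answering", "answer_time", "reply"]

spark = SparkSession.builder.appName("xinjian").getOrCreate()

# Load the crawl output, split records on $$ and fields on &&.
raw = spark.sparkContext.wholeTextFiles("outfile/xinjian.txt").values()
rows = (raw.flatMap(lambda text: text.split("$$"))
           .filter(lambda rec: rec.strip())
           .map(lambda rec: rec.split("&&"))
           .filter(lambda parts: len(parts) == len(FIELDS))  # drop malformed records
           .map(lambda parts: Row(**dict(zip(FIELDS, parts)))))
df = spark.createDataFrame(rows)

# Placeholder JDBC settings; the real URL, table, and password will differ.
df.write.jdbc("jdbc:mysql://localhost:3306/xinjian_db", "xinjian", mode="append",
              properties={"user": "root", "password": "***",
                          "driver": "com.mysql.cj.jdbc.Driver"})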

Reposted from www.cnblogs.com/lq13035130506/p/12305796.html