oceanwp 去掉cart price








去掉HOME



扫描二维码关注公众号,回复: 140311 查看本文章

#   -*- coding:gbk   -*-


import requests
from bs4 import BeautifulSoup 


import time


import io
import sys
import re
#sys.stdout = io.TextIOWrapper(sys.stdout,encoding='gb18030') #改变标准输出的默认编码  
#print(k)
#kc=str(k.contents).replace(u'\xa0',u' ').replace(u'\u203a',u' ')
#print('\n'.join(['%s'%c for c in b.select('ol ol a')]))
##kc=str(k.contents).encode('gb18030').decode('gbk','ignore')
##         tat=tat+kc[1:len(kc)-1]
#print(tat,file=ff)  #,file=ff




# Fetch the WeChat page whose <a> links are harvested further down.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops early instead of parsing an HTTP error page.
m=requests.get('https://mp.weixin.qq.com/s/WUrnjh_sz91kMQZYcTAFXg',timeout=30)
m.raise_for_status()

# Parsed document; the link loop calls b.find_all('a') on it.
b=BeautifulSoup(m.text,'lxml')


# Python: strip punctuation and special symbols
#string = re.sub("[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+", "",line)




# Characters removed from a link title before it becomes a file name:
# whitespace, the ASCII and CJK punctuation listed in the original pattern,
# and the remaining characters Windows forbids in file names (\ : < > |).
# The original pattern had a stray "'" after its second alternative, so the
# CJK punctuation was only stripped when an apostrophe happened to follow it.
_TITLE_JUNK_RE=re.compile(r"""[\s+.!/_,$%^*("')—?“”。、~@#¥…&\\:<>|]+""")

# For every link whose title is longer than 5 characters: record
# "title / href" in the text index (and on stdout), then download the linked
# page to "<cleaned title>.html" in the current directory.
with open('外土司_微信文章.txt','w') as ff:
    for link in b.find_all('a'):
        # Drop characters GBK cannot represent (presumably so the text can be
        # printed on a GBK console -- confirm). Encoding with 'ignore' removes
        # them cleanly; the previous gb18030 -> gbk round trip left the ASCII
        # digit bytes of every 4-byte GB18030 sequence behind as stray digits.
        txt=link.get_text().encode('gbk','ignore').decode('gbk')
        hh=link.get('href')
        if len(txt.strip())<=5:
            continue

        print("title: %s  ,   href: %s"%(txt,hh),file=ff)
        print("title: %s  ,   href: %s"%(txt,hh))

        # Anchors without an absolute http(s) target (missing href,
        # "javascript:", fragments, relative paths) cannot be fetched.
        if not hh or not hh.startswith(('http://','https://')):
            continue

        fn=_TITLE_JUNK_RE.sub("",txt)+'.html'
        print(fn+'  ,  '+hh)

        # One unreachable link must not abort the remaining downloads.
        try:
            response=requests.get(hh,timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            print("download failed: %s  ,  %s"%(hh,err))
            continue

        with open(fn,'bw') as htmlff:
            htmlff.write(response.content)
            
        


# Disabled earlier variant, kept as a bare string literal so it never runs.
# It collected the href/title of every element matching 'ol ol a', fetched
# each href relative to https://www.w3cschool.cn, and printed the prettified
# 'pro-mian-header' element and 'content-bg' div into the text file,
# sleeping 5 seconds between requests.
""" 
hh=[h.get('href')  for h in b.select('ol ol a')]
tt=[t['title']  for t in b.select('ol ol a')]




w3c='https://www.w3cschool.cn'


with open('外土司_微信文章.txt','w') as ff:
  tat=''
  for h in hh:
      
     hlink=w3c+h
     subb=BeautifulSoup(requests.get(hlink).text,'lxml')
     print(hlink)


     cc=subb.find(id='pro-mian-header').prettify()  #从bs4.element.Tag转到list
     dd=subb.find('div','content-bg').prettify()  #从bs4.element.Tag转到list
     tt=cc+dd
     print(tt.encode('gb18030').decode('gbk','ignore'),file=ff)
     print(''.join([ '-' for i in range(180)]))
    


     time.sleep(5)
 
"""

猜你喜欢

转载自blog.csdn.net/xtjie/article/details/79846900
今日推荐