爬取笔趣阁小说(一念永恒)

!:注意编码格式(重要的事情说三遍):抓取中文页面时必须先设置正确的编码,否则解析出的文本会乱码。

!!:http://xiaorui.cc/2016/02/19/%E4%BB%A3%E7%A0%81%E5%88%86%E6%9E%90python-requests%E5%BA%93%E4%B8%AD%E6%96%87%E7%BC%96%E7%A0%81%E9%97%AE%E9%A2%98/

!!!:https://www.zhihu.com/question/264878732

!!!!:解决办法:res.encoding = res.apparent_encoding(让 requests 按检测到的编码来解码响应内容)

import requests
from bs4 import BeautifulSoup
import re
import sys
# Scratch record reused by gettext: holds the 'title' and 'content' of the
# chapter that is currently being written to disk.
article = dict()
# List that getlink fills with chapter hrefs and returns.
ll = list()
def getlink(url, start=12, stop=20):
    """Collect chapter hrefs from the page at *url*.

    The page is fetched, decoded with the encoding detected from its bytes,
    and every ``<a href=...>`` found inside the ``<dd>`` elements numbered
    ``start`` (inclusive) to ``stop`` (exclusive) is collected.

    Args:
        url: Address of the page listing the chapters.
        start: Index of the first ``<dd>`` to read. The default of 12 is the
            original hard-coded value -- presumably it skips a block of
            entries that precedes the real chapter list; confirm against
            the live page.
        stop: Index one past the last ``<dd>`` to read (default 20).

    Returns:
        The module-level list ``ll``, whose contents are replaced with the
        hrefs found on this page (in document order). Fewer entries than
        requested are returned when the page has fewer ``<dd>`` elements.
    """
    # A timeout keeps an unresponsive server from hanging the script forever.
    res = requests.get(url, timeout=10)
    # requests may guess the wrong charset from the headers; use the one
    # detected from the body so Chinese text is decoded correctly.
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'html.parser')

    # Query the <dd> elements once and slice: slicing never raises on a
    # short page, unlike indexing each position separately.
    entries = soup.find_all('dd')[start:stop]

    # Only real anchors carrying an href are kept; bare text nodes inside a
    # <dd> have no usable link and anchors without href would yield None.
    links = [
        anchor['href']
        for entry in entries
        for anchor in entry.find_all('a', href=True)
    ]

    # Replace (rather than extend) the shared list so that repeated calls do
    # not return links left over from a previous call.
    ll[:] = links
    return ll
# Site footer found at the end of a chapter body: an optional page URL
# (possibly parenthesised) followed by the "remember this domain" notice and
# everything after it.
_SITE_TRAILER = re.compile(
    r'\s*(?:\(?https?://[^\s()]+?\)?)?\s*请记住本书首发域名.*\Z',
    re.S,
)


def _clean_chapter_text(raw):
    """Remove layout characters and the site's trailing notice from *raw*."""
    text = raw.replace('\r', ' ').replace('\u3000', '').replace('\xa0', '')
    # str.rstrip(chars) strips a *set of characters*, not a suffix, so it can
    # eat genuine trailing text; a pattern removes exactly the footer.
    return _SITE_TRAILER.sub('', text)


def gettext(url, count=3):
    """Download the first chapters of a book and append them to a text file.

    The book title is read from the ``.info h2`` element of the page at
    *url* and used as the output file name (``<title>.txt`` in the current
    directory). Chapter links come from ``getlink(url)``; for each of the
    first *count* links the chapter title (``.content h1``) and cleaned body
    (``.showtxt``) are appended to the file, one per line.

    Args:
        url: Address of the book's chapter-list page.
        count: Maximum number of chapters to download (default 3, the
            original hard-coded value). Fewer are written when fewer links
            are available.

    Returns:
        None. As a side effect the module-level ``article`` dict holds the
        title and content of the last chapter written.

    Raises:
        IndexError: If an expected element is missing from a page.
    """
    from urllib.parse import urljoin

    res = requests.get(url, timeout=10)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'html.parser')
    filename = soup.select('.info h2')[0].text
    hrefs = getlink(url)

    # The context manager closes the file even if a chapter fails midway;
    # an explicit encoding avoids platform-dependent UnicodeEncodeError.
    with open("%s.txt" % filename, 'a', encoding='utf-8') as f:
        for href in hrefs[:count]:
            # Resolve the href against the page it came from instead of a
            # hard-coded host.
            page = requests.get(urljoin(url, href), timeout=10)
            # The detected encoding must be *assigned*; merely reading
            # apparent_encoding leaves the response decoded incorrectly.
            page.encoding = page.apparent_encoding
            st = BeautifulSoup(page.text, 'html.parser')
            article['title'] = st.select('.content h1')[0].text
            article['content'] = _clean_chapter_text(
                st.select('.showtxt')[0].text
            )
            f.write(article['title'] + '\n')
            f.write(article['content'] + '\n')
# Book page on biqukan that is handed to gettext.
url = 'http://www.biqukan.com/1_1094/'

# Only scrape when executed as a script, so that importing this module does
# not trigger network requests and file writes.
if __name__ == '__main__':
    gettext(url)

猜你喜欢

转载自www.cnblogs.com/leolaosao/p/9095746.html