汽车之家数据爬取:文章链接//图片//标题

(1)打印出来的东西乱码,如何处理这个问题?

import requests

# Step 1: download the Autohome Beijing landing page and dump the decoded HTML.
# Per the original notes, this current address prints readable text, whereas
# the legacy address https://www.autohome.com.cn/news/ prints garbled text.
BEIJING_URL = 'https://www.autohome.com.cn/beijing/'

response = requests.get(url=BEIJING_URL)
print(response.text)

(2)

import requests

# Step 2: request the legacy news page (the one that printed garbled text;
# the Beijing page from step 1 did not need this) and override the charset
# used for decoding by hand.
NEWS_URL = 'https://www.autohome.com.cn/news/'

response = requests.get(url=NEWS_URL)

# 'utf-8' was tried here as well; according to the original notes, both
# 'utf-8' and 'gbk' still left part of the output garbled.
response.encoding = 'gbk'

print(response.text)

(3)

import requests

# Step 3: request the legacy news page again, but instead of hard-coding
# 'utf-8' or 'gbk' (both tried in step 2), decode with the charset that
# requests detects from the response body.
NEWS_URL = 'https://www.autohome.com.cn/news/'

response = requests.get(url=NEWS_URL)

# NOTE(review): the original notes say this ends up equivalent to setting
# 'gbk' for this page -- that depends on the live page content; confirm.
detected_charset = response.apparent_encoding
response.encoding = detected_charset

print(response.text)

(4)

import requests
from bs4 import BeautifulSoup

# Step 4: parse the legacy news page and locate the article-list container.
NEWS_URL = 'https://www.autohome.com.cn/news/'
CONTAINER_ID = 'auto-channel-lazyload-article'

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url=NEWS_URL, timeout=10)

# Decode with the charset detected from the response body.
response.encoding = response.apparent_encoding

# Turn the HTML text into a navigable object. `features` selects the parser:
# 'html.parser' ships with Python; 'lxml' is a third-party parser that must be
# installed separately (the original notes recommend it for production use).
soup = BeautifulSoup(response.text, features='html.parser')

target = soup.find(id=CONTAINER_ID)
if target is None:
    # find() returns None when nothing matches; stop with a clear message
    # instead of failing later with an AttributeError on None.
    raise SystemExit(
        'element with id %r not found; the page layout may have changed'
        % CONTAINER_ID
    )

# Look up by tag name: find() yields only the first matching <li> (or None).
# The result is kept in a name rather than being silently discarded.
first_li = target.find('li')

print(target)

(5)目前的最终版(后期有待完善)　　注意注释

import requests
from bs4 import BeautifulSoup

# Step 5 (current final version): print link, title element and image URL
# for every article entry on the legacy news page.
NEWS_URL = 'https://www.autohome.com.cn/news/'
CONTAINER_ID = 'auto-channel-lazyload-article'

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url=NEWS_URL, timeout=10)

# Decode with the charset detected from the response body.
response.encoding = response.apparent_encoding

# Turn the HTML text into a navigable object. `features` selects the parser:
# 'html.parser' ships with Python; 'lxml' is a third-party parser that must be
# installed separately (the original notes recommend it for production use).
soup = BeautifulSoup(response.text, features='html.parser')

target = soup.find(id=CONTAINER_ID)
if target is None:
    # find() returns None when nothing matches; stop with a clear message
    # instead of failing later with an AttributeError on None.
    raise SystemExit(
        'element with id %r not found; the page layout may have changed'
        % CONTAINER_ID
    )

# find('li') would return only the first <li>; find_all returns all of them
# as a list.
li_list = target.find_all('li')

for item in li_list:
    link = item.find('a')
    if link is None:
        # Some <li> entries contain no <a>; skip them.
        continue

    # Article URL (None when the anchor has no href attribute).
    print(link.attrs.get('href'))

    # Title element (<h3>), or None when absent. URL + title are what would
    # be stored in an app or a database.
    print(link.find('h3'))

    # Image URL. Not every anchor wraps an <img>, so guard against None
    # rather than crashing on img.get(...).
    img = link.find('img')
    print(img.get('src') if img is not None else None)

(6)

# Classmate's example (flagged as problematic in the original notes).
import requests
from bs4 import BeautifulSoup

url = 'https://www.autohome.com.cn/news/'
response = requests.get(url)

# Decode with the charset detected from the response body.
response.encoding = response.apparent_encoding

# Parsing with 'lxml' raised an error because the lxml module was not
# installed, so the parser bundled with Python is used instead.
soup = BeautifulSoup(response.text, features='html.parser')

page_title = soup.title
print(page_title.text)

# Output recorded in the original notes: 【图】最新汽车新闻_资讯_汽车之家

汽车之家数据爬取:文章链接//图片//标题

猜你喜欢