爬取过程中的问题解决
环境 :Windows + Python 3.7 + selenium
1. class 中含有空格,如何定位
<div class="u-cover u-cover-1">
browser. find_elements_by_css_selector( '[class = "u-cover u-cover-1"]' )
2. 定位信息没有问题,但就是获取不到元素
原因:
selenium 打开网页后, 默认是在父级 Frame 里, 直接搜索是搜不到子 Frame 里的信息的。
需要切换 Frame。
方法:
switch_to.frame('frameid')
switch_to.parent_frame()
另一种原因:点击后,浏览器新打开了一个选项卡,但 driver 没有切换过去,导致仍然在旧的选项卡里查找。
解决:
browser.switch_to_window(browser.window_handles[1])
<iframe name="contentFrame" id="g_iframe" class="g-iframe" scrolling="auto" frameborder="0" src="about:blank" allowfullscreen="true"></iframe>
browser. switch_to. frame( 'g_iframe' )
3. find_elements后点击不了抓取的元素
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
we_gedans = self. browser. find_elements_by_css_selector( '[class = "u-cover u-cover-1"]' )
for we_gedan in we_gedans:
we_gedan. click( )
官方给出解释如下:
The element has been deleted entirely.
The element is no longer attached to the DOM.
就是页面元素过期,引用的元素过时,不再依附于当前页面,需要重新定位获取元素对象
find_elements 查找到的是 WebElement 类型的数组数据,含有元素在当前页面的地址信息,调用 click() 方法就是使用了此地址信息。
切换页面后,此地址信息就失效了。所以不能直接在循环中调用 click()
4. eyed3 pip 安装成功, import报错
import magic
File "C:\Users\zuoy\AppData\Local\Programs\Python\Python37\lib\site-packages\magic.py", line 181, in <module>
raise ImportError('failed to find libmagic. Check your installation')
ImportError: failed to find libmagic. Check your installation
原因: eyed3 依赖 magic,必须安装上这个才能使用
pip install python-magic-bin
pip install eyed3
代码
'''
功能:访问网易云音乐网站,下载歌单里的所有免费歌曲
时间:2019/07/20
'''
from selenium import webdriver
from selenium. webdriver. common. by import By
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. support. wait import WebDriverWait
import requests
import eyed3
import time
import json
import os
class Splider(object):
    """Selenium-driven crawler for https://music.163.com/.

    The crawler opens the landing page, collects the playlist cards shown
    there, visits every playlist to list its songs, and downloads each song
    as an MP3 file tagged with artist, album and title.
    """

    # Characters that Windows refuses in file names; they are replaced
    # before a song title is used as a file name.
    _INVALID_FILENAME_CHARS = '\\/:*?"<>|'
    # Seconds to wait for the download request before giving up.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        """Remember the site URL and start a Chrome browser session."""
        self.__url = 'https://music.163.com/'
        self.browser = webdriver.Chrome()

    @classmethod
    def _safe_filename(cls, name):
        """Return *name* with characters that are illegal in file names replaced.

        Each forbidden character becomes ``_``; surrounding spaces and dots
        are stripped. An empty result falls back to ``'untitled'``.
        """
        cleaned = ''.join(
            '_' if ch in cls._INVALID_FILENAME_CHARS else ch for ch in name
        ).strip(' .')
        return cleaned or 'untitled'

    @staticmethod
    def _json_path(filename):
        """Return *filename* with its extension forced to ``.json``."""
        root, ext = os.path.splitext(filename)
        return filename if ext == '.json' else root + '.json'

    def __get_gedans(self, url=None):
        """Collect the playlist cards found on a page.

        Args:
            url: Page to open. Defaults to the site URL set in ``__init__``.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``image`` and
            ``click``, one per playlist card.
        """
        self.browser.get(url or self.__url)
        # The page content lives inside an iframe; elements are invisible
        # to the driver until we switch into it.
        self.browser.switch_to.frame('g_iframe')
        covers = self.browser.find_elements(
            By.CSS_SELECTOR, '[class = "u-cover u-cover-1"]')
        gedans = []
        for cover in covers:
            link = cover.find_element(By.CSS_SELECTOR, 'a')
            gedans.append({
                'title': link.get_attribute('title'),
                'url': link.get_attribute('href'),
                'image': cover.find_element(By.CSS_SELECTOR, 'img').get_attribute('src'),
                'click': cover.find_element(By.CSS_SELECTOR, '.nb').text,
            })
        return gedans

    def __get_songs(self, gedan):
        """List the songs of one playlist.

        Args:
            gedan: A playlist dict as produced by ``__get_gedans``; only its
                ``url`` entry is used.

        Returns:
            A list of dicts with the keys ``id``, ``name``, ``songer`` and
            ``cd``. Every value has its whitespace runs collapsed to single
            spaces; a missing attribute becomes an empty string.
        """
        self.browser.get(gedan['url'])
        self.browser.switch_to.frame('g_iframe')
        # NOTE(review): this exact-match selector only picks rows whose class
        # attribute is literally "even "; rows with any other class value are
        # skipped. Confirm that this is intended.
        rows = self.browser.find_elements(By.CSS_SELECTOR, '[class = "even "]')
        songs = []
        for row in rows:
            texts = row.find_elements(By.CSS_SELECTOR, '.text')
            raw = {
                'id': row.find_element(By.CSS_SELECTOR, '.left .hd span').get_attribute('data-res-id'),
                'name': row.find_element(By.CSS_SELECTOR, '.f-cb b').get_attribute('title'),
                'songer': texts[0].get_attribute('title'),
                'cd': texts[1].find_element(By.CSS_SELECTOR, 'a').get_attribute('title'),
            }
            # get_attribute returns None for a missing attribute, so guard
            # before normalising whitespace.
            songs.append({
                key: ' '.join((value or '').split())
                for key, value in raw.items()
            })
        return songs

    def __download_song(self, song):
        """Download one song into the current directory and tag it.

        Args:
            song: A song dict as produced by ``__get_songs``.

        Nothing is written when the server does not answer with status 200.
        If the downloaded data cannot be loaded by eyed3, the file is kept
        but left untagged.
        """
        base_url = "http://music.163.com/song/media/outer/url?id={0}"
        response = requests.get(
            base_url.format(song['id']), timeout=self._REQUEST_TIMEOUT)
        if response.status_code != 200:
            return
        # Song titles may contain characters that are illegal in file names.
        filename = self._safe_filename(song['name']) + '.mp3'
        with open(filename, 'wb') as f:
            f.write(response.content)
        audio = eyed3.load(filename)
        if audio is None:
            # eyed3.load returns None when it does not recognise the file.
            print('not a recognisable audio file, tags skipped:', filename)
            return
        if audio.tag is None:
            # A file without any tag needs one created before it can be edited.
            audio.initTag()
        audio.tag.artist = song['songer']
        audio.tag.album = song['cd']
        audio.tag.title = song['name']
        audio.tag.save()

    def __save_json(self, strs, filename):
        """Write *strs* as indented UTF-8 JSON to *filename* (forced to ``.json``)."""
        with open(self._json_path(filename), 'w', encoding='utf-8') as f:
            f.write(json.dumps(strs, indent=4, ensure_ascii=False))

    def __read_json(self, filename):
        """Load and return the JSON content of *filename* (forced to ``.json``)."""
        with open(self._json_path(filename), 'r', encoding='utf-8') as f:
            return json.loads(f.read())

    def run(self):
        """Crawl all playlists, download their songs, then shut the browser down.

        Playlist metadata goes to ``gedans.json``, the song list of the
        playlist being processed goes to ``songs.json``, and the audio files
        go into the ``songs`` directory. Failures on a single playlist or
        song are printed and skipped.
        """
        try:
            gedans = self.__get_gedans()
            self.__save_json(gedans, 'gedans.json')
            # The download directory may not exist on a first run.
            os.makedirs('songs', exist_ok=True)
            for gedan in gedans:
                # Reset per playlist so a failure below can neither raise
                # NameError nor re-download the previous playlist's songs.
                songs = []
                try:
                    songs = self.__get_songs(gedan)
                    # NOTE(review): every playlist overwrites the same
                    # songs.json, so only the last one survives. Confirm
                    # whether a per-playlist file is wanted.
                    self.__save_json(songs, 'songs.json')
                except Exception as e:
                    print(e.args)
                if not songs:
                    continue
                os.chdir('songs')
                try:
                    for song in songs:
                        try:
                            self.__download_song(song)
                        except Exception as e:
                            print(e.args)
                finally:
                    # Always return to the start directory, even on Ctrl+C.
                    os.chdir('..')
        finally:
            # quit() ends the whole driver session; close() only closes the
            # current window and can leave the driver process behind.
            self.browser.quit()
if __name__ == '__main__':
    # Script entry point: build the crawler and run the whole pipeline.
    Splider().run()