# http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=35
# http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=70
"""Crawl singers, albums, songs, lyrics and hot comments from music.163.com.

The crawl is a chain: ``singer()`` -> ``album()`` -> ``song()`` ->
``word_cmts()``.  Each stage stores its rows in the local MySQL database
``163_song`` (tables ``singer_tb``, ``album_tb`` and ``song_tb``).
"""
from urllib import request,parse
import pymysql
from bs4 import BeautifulSoup
import re
import random
from selenium import webdriver
import time
import json
import requests

# NOTE: connection settings are hard-coded; move them to configuration or
# environment variables before sharing this script.
# Keyword arguments are required: PyMySQL >= 1.0 no longer accepts the
# positional form connect(host, user, password, database).
DB_CONFIG = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': 'zb376100870',
    'database': '163_song',
    'charset': 'utf8',
}

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
]

# NOTE: session cookie captured from a browser; it is written as adjacent
# literals only for readability and is sent as one header value.  Replace it
# with a fresh cookie when the session expires.
COOKIE = (
    '_ntes_nnid=9590bff1d254dd7fa3f25d8e0d311522,1510122577046; '
    '_ntes_nuid=9590bff1d254dd7fa3f25d8e0d311522; '
    '_ngd_tid=dGE34taznjKAcDgyTJtO2J7d%2Bg%2BOLdSZ; '
    'usertrack=ezq0pVqnqQJAj7faBRRpAg==; '
    '__f_=1521533778639; '
    'starttime=; '
    'NTES_SESS=gWckWp0kS7P8.9Ll7xa1C2UB_4KpPrwEUUnWVSgGwgUuP_OdPs6CJlnou4rhyTwLPIJVjVNPLy3yX18e7HET2ih10YmTlRyC72K7.chKLFGewToNDdDACY3ojifgYw5TipjDIF7JEcSBMG6jhgsdk4TJayFaVg0m3mSciKsZf0JgHiZjNW9Urz_X2s8tcyGw9.DPBx6s5eROyccVAQqxWLj.v33_K253y; '
    'S_INFO=1523348346|0|3&80##|m13349949963_1; '
    '[email protected]|1523348346|0|mail163|00&99|null&null&null#jix&360100#10#0#0|133963&1||[email protected]; '
    '[email protected]:-1:1; '
    'df=mail163_letter; '
    'Province=0790; '
    'City=0791;'
    'MUSIC_EMAIL_U=8e19f5c8cbc11303a2d71c0d3532255599dcb9d14e06692f6202bc9e762f9363b95b9657afa5636f8b99162d656ec71eead030040e2add0d4bcf6e8189568a96; '
    'playliststatus=visible; '
    'JSESSIONID-WYYY=dwTSf7No9xGH7HzrhqYcwnPQIVAnwgM6Pq%2FO%5ClmDiH2l5ScrkuvMSG%2BYZutH6wAz9WPwmNoo2evEm9Ee%2B%2Fa3%5Cx%5C%2B%2FUoToq37TQd%2BkzRzkSimgZlpbqnQXVP%5Cdu86phA4Se0w%2FQpgg15A8%2FjES3ahRByglGzMjzuiSSE8DRk%2B9ojksu33%3A1523358787887; '
    '_iuqxldmzr_=32; '
    '__utma=94650624.199382336.1520261898.1523349418.1523355839.5; '
    '__utmb=94650624.21.10.1523355839; '
    '__utmc=94650624; '
    '__utmz=94650624.1523349418.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
)

# Compiled once at import time instead of once per scraped row.
_DIGITS = re.compile('[0-9]+')
_LRC_TAG = re.compile(r'\[.*\]')
# Characters outside the Basic Multilingual Plane (e.g. emoji) need 4 bytes in
# UTF-8, which the connection's 3-byte ``utf8`` charset cannot store.
_NON_BMP = re.compile(u'[\U00010000-\U0010ffff]')


def _fetch(url, headers):
    """Return the body of ``url`` decoded as UTF-8, closing the response."""
    req = request.Request(url, headers=headers)
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')


def _first_number(href):
    """Return the first run of digits found in ``href`` as a string."""
    return _DIGITS.findall(href)[0]


def _insert(sql, data):
    """Run one parameterized INSERT, commit it, and always close the connection."""
    conn = pymysql.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, data)
        conn.commit()
    finally:
        conn.close()


def singer():
    """Scrape the artist category page and crawl every singer listed on it.

    Each singer's name and numeric id are inserted into ``singer_tb``; the id
    and the request headers are then passed to ``album()``.

    Raises:
        IndexError: if the page has no ``.m-sgerlist`` element or a list item
            lacks the expected artist link.
    """
    base_url = 'http://music.163.com/discover/artist/cat?id=1001'
    ua = random.choice(USER_AGENTS)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Accept-Encoding:gzip, deflate, sdch,
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': COOKIE,
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua,
    }

    soup = BeautifulSoup(_fetch(base_url, headers), 'lxml')
    singer_list = soup.select(".m-sgerlist")[0]
    for li in singer_list.find_all(['li']):
        link = li.select('a[class="nm nm-icn f-thide s-fc0"]')[0]
        sger_name = link.text
        sger_id = _first_number(link.attrs['href'])
        _insert(
            "insert into singer_tb(sger_name,sger_num) VALUE (%s,%s)",
            (sger_name, int(sger_id)),
        )
        # get album
        album(sger_id, headers)


def album(id, headers):
    """Scrape a singer's album pages, store each album and crawl its songs.

    Requests 10 pages of 12 albums each (offsets 0, 12, ..., 108).  Pages
    without a ``#m-song-module`` element are skipped.

    Args:
        id: singer id, as a digit string or an int.
        headers: HTTP request headers to send with every request.
    """
    sger_num = int(id)
    for page in range(10):
        offset = page * 12
        sger_url = 'http://music.163.com/artist/album?id=%d&limit=12&offset=%d' % (sger_num, offset)
        soup = BeautifulSoup(_fetch(sger_url, headers), 'lxml')
        albums = soup.select('#m-song-module')
        if not albums:
            continue
        for li in albums[0].find_all(['li']):
            link = li.select('a[class="tit s-fc0"]')[0]
            album_name = link.text
            album_num = int(_first_number(link.attrs['href']))
            _insert(
                "insert into album_tb(album_name,album_num,sger_num) VALUE (%s,%s,%s)",
                (album_name, album_num, sger_num),
            )
            song(album_num, headers, sger_num)


def song(album_num, headers, sger_num):
    """Scrape one album page and hand every listed song to ``word_cmts()``.

    Args:
        album_num: numeric album id.
        headers: HTTP request headers to send with the request.
        sger_num: numeric id of the singer who owns the album.

    Raises:
        IndexError: if the page has no ``ul.f-hide`` song list.
    """
    song_url = 'http://music.163.com/album?id=%d' % (album_num)
    soup = BeautifulSoup(_fetch(song_url, headers), 'lxml')
    album_song = soup.select('ul[class="f-hide"]')[0]
    for link in album_song.find_all(['a']):
        song_name = link.text
        song_num = int(_first_number(link.attrs['href']))
        print(song_name, song_num, album_num, sger_num)
        word_cmts(song_name, song_num, album_num, sger_num, headers)


def word_cmts(song_name, song_num, album_num, sger_num, headers):
    """Fetch a song's lyrics and hot comments and insert the row into ``song_tb``.

    Bracketed tags (``[...]``) are stripped from the lyrics.  The hot comments
    are stored as the ``str()`` of a list of stripped comment texts, with every
    character above U+FFFF replaced by ``??``.  A missing lyric or missing hot
    comments are stored as an empty string / empty list instead of raising.

    Args:
        song_name: song title.
        song_num: numeric song id.
        album_num: numeric album id.
        sger_num: numeric singer id.
        headers: HTTP request headers to send with both API requests.
    """
    print(song_num)
    word_url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(song_num) + '&lv=1&kv=1&tv=-1'
    j = json.loads(_fetch(word_url, headers))
    print('~~~~~~~~~~~', j)
    # 'lrc' may be absent, and 'lyric' may be absent or null inside it.
    lyric = (j.get('lrc') or {}).get('lyric') or ''
    lrc = _LRC_TAG.sub("", lyric).strip()

    cmts_url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_' + str(song_num) + '?id=' + str(song_num) + '&lv=1&kv=1&tv=-1'
    c = json.loads(_fetch(cmts_url, headers))
    comments = [cmt['content'].strip() for cmt in (c.get('hotComments') or [])]
    # Stringify first, then replace non-BMP characters, as the stored format expects.
    cmmt = _NON_BMP.sub(u'??', str(comments))

    _insert(
        "insert into song_tb(song_name,song_num,album_num,sger_num,song_lrc,song_cmts) VALUE (%s,%s,%s,%s,%s,%s)",
        (song_name, song_num, album_num, sger_num, lrc, cmmt),
    )


if __name__ == '__main__':
    singer()
Field Description
singer_tb is the singer table, with the following fields:
sger_id: row id, used for counting and sorting
sger_name: singer name
sger_num: the singer's numeric id on the site, which can be used for multi-table join queries
Number of singers: 6
album_tb is the album table, with the following fields:
album_id: row id, used for counting and sorting
album_name: album name
album_num: album number, which can be used for multi-table join queries with the song_tb song table
sger_num: the singer's numeric id, which can be used for multi-table join queries with the singer_tb singer table
Number of albums: 223
song_tb is the song table, with the following fields:
song_id: row id, used for counting and sorting
song_name: song name
song_num: song number
album_num: album number, which can be used for multi-table join queries with the album_tb album table
sger_num: the singer's numeric id, which can be used for multi-table join queries with the singer_tb singer table
song_lrc: lyrics
song_cmts: song comments (hot comments)
Number of songs: 2286
Simple description of multi-table join query
The tables are linked to one another through their shared number columns (sger_num and album_num), so related data can be retrieved with multi-table join queries run directly in the database.
For example, starting from Jay Chou's singer record, you can look up all of Jay Chou's songs and albums with a multi-table join.
For example, enter the following in the database terminal:
select song_name, sger_name from song_tb, singer_tb where song_tb.sger_num = singer_tb.sger_num;
This is just one example of a multi-table join query.
Problems encountered:
- Fetching lyrics: the web page's JavaScript encryption is fairly complicated, so instead I used an API endpoint that returns the lyrics directly.
- Inserting comments into the database: comments containing emoji could not be inserted, so the emoji have to be stripped from the comment text first.
The cause is the database encoding. The database's default encoding is utf8, which in MySQL stores at most three bytes per character, but an emoji occupies four bytes, so the insert fails. A regular expression is used to replace the emoji before inserting.