# Python + requests: scrape the movies listed on the Maoyan ("Cat's Eye") board, using `re` regular expressions to extract each movie's rank, poster image address, title, starring cast, release time, and score.
"""Scrape the Maoyan movie board and store one JSON record per movie."""
import json
import re
from typing import Dict, Iterator

# One <dd> element per movie.  Capture groups, in order: rank, poster URL,
# title, starring line, release-time line, integer part and fractional part
# of the score.  re.S lets ".*?" span the newlines between the tags.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)

# Leading "label:" prefix, terminated by an ASCII colon or the full-width
# colon U+FF1A (written as an escape so this file stays pure ASCII).
_LABEL_PREFIX = re.compile(r"^[^:\uff1a]*[:\uff1a]\s*")


def _strip_label(text: str) -> str:
    """Return *text* trimmed, without a leading ``label:`` prefix if present.

    The starring and release-time fields were originally cleaned with
    fixed-width slices (``[3:]`` / ``[4:]``), which silently chop real data
    whenever the label length differs.  Cutting at the first colon does not
    depend on the label's length; text without a colon is returned as is.
    NOTE(review): presumably the page labels end with a full-width colon --
    confirm against the live markup.
    """
    return _LABEL_PREFIX.sub("", text.strip(), count=1)


def get_html(url: str, timeout: float = 10) -> str:
    """Fetch the page at *url* and return its HTML source as text.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: the decoded response body
    """
    # Imported here so the parsing and storage helpers remain importable
    # when the third-party ``requests`` package is not installed.
    import requests

    # Browser identification sent with the request.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    )
    headers = {"User-Agent": user_agent}
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


def parse_one_page(html: str) -> Iterator[Dict[str, str]]:
    """Yield one record per movie entry found in *html*.

    :param html: HTML source of a board page
    :return: generator of dicts with the keys ``rank``, ``image address``,
        ``title``, ``starring``, ``release time`` and ``score``; nothing is
        yielded when no entry matches
    """
    for match in _MOVIE_PATTERN.finditer(html):
        rank, image, title, star, release, integer, fraction = match.groups()
        yield {
            "rank": rank,
            "image address": image,
            "title": title,
            "starring": _strip_label(star),
            "release time": _strip_label(release),
            # The score is split across two tags; join the two parts.
            "score": integer + fraction,
        }


# Data storage
def write_file(content, path: str = "result.txt") -> None:
    """Append *content* to *path* as a single JSON line.

    :param content: JSON-serialisable record
    :param path: output file; defaults to ``result.txt``
    """
    with open(path, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main():
    """Download the board page, then print and store every movie record."""
    url = "http://maoyan.com/board/4"
    html = get_html(url)
    for item in parse_one_page(html):
        print(item)
        write_file(item)


if __name__ == '__main__':
    main()