Today I need to quickly crawl some bilingual material (not yet cleaned) and make full use of it.
The code below collects the link to each bilingual news article on the Chinadaily site. First, study the URLs and page structure of these pages, including pagination: the later pages of a listing are usually the first page's URL with _2, _3, and so on appended. So the following code only gathers the links.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File: bi_news.py
Author: ZhangHaiou ([email protected])
Date: 2018/05/04
"""
import urllib   # Python 2: provides urllib.urlopen / urllib.urlretrieve
import re
import os

bi_urls = []

def getHtml(url):
    # Read the web page content as a list of lines
    page = urllib.urlopen(url)
    html = page.readlines()
    #print html
    return html

def getImg(html):
    # Unused helper kept from an earlier script: download every .jpg in the page
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1

def geturl(html):
    # Extract the article links we need from the listing page
    for line in html:
        if re.search(r'<div class="mr10"><a href="\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm"', line):
            if re.search(r'<div class="mr10"><a href="2016-\d\d/\d\d/content_\d{4,}\.htm"', line):
                # Just want the corpus after 2016, so stop at the first 2016 article
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line)
                print("http://language.chinadaily.com.cn/" + url[0])
                bi_urls.append("http://language.chinadaily.com.cn/" + url[0])

if __name__ == '__main__':
    n = 1
    # os.system('wget -r --spider http://language.chinadaily.com.cn/news_bilingual.html')
    # geturl(getHtml("http://language.chinadaily.com.cn/news_bilingual.html"))
    while n:    # walk the listing pages until geturl() exits at the 2016 cutoff
        if n < 2:
            html = getHtml("http://language.chinadaily.com.cn/columnist/columnist.html")
        else:
            html = getHtml("http://language.chinadaily.com.cn/columnist/columnist.html_" + str(n) + ".html")
        geturl(html)
        n = n + 1
```
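Note that these scripts target Python 2 (urllib.urlopen no longer exists in Python 3). If you only have Python 3, a minimal sketch of an equivalent getHtml could look like this, assuming the pages decode as UTF-8 (the site's actual encoding is something you would need to check):

```python
from urllib.request import urlopen  # Python 3 replacement for Python 2's urllib.urlopen

def getHtml(url):
    # Fetch the page and return it as a list of lines, like readlines() did
    with urlopen(url) as page:
        return page.read().decode("utf-8", errors="ignore").splitlines()
```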
Execute python bi_news.py > url.txt to save the collected URLs.
url.txt then contains one article URL per line, for example (placeholder IDs, illustrative only):
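```
http://language.chinadaily.com.cn/2017-12/29/content_XXXXXXXX.htm
http://language.chinadaily.com.cn/2017-12/28/content_XXXXXXXX.htm
...
```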
The next step is simply to crawl the page content behind each URL in url.txt and organize the news into folders by month. The file name is the last eight digits of each news link, as the short example below illustrates.
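Here is a quick illustration of how the month folder and file name fall out of a link (the URL is hypothetical; only its shape matters):

```python
import re

# Hypothetical article URL, shaped like the entries in url.txt
link = "http://language.chinadaily.com.cn/2017-05/04/content_12345678.htm"

date = re.findall(r'\d\d\d\d-\d\d', link)[0]   # "2017-05" -> the month folder
name = re.findall(r'\d{6,}', link)[0]          # "12345678" -> the file name
print(date + "/" + name + ".txt")              # 2017-05/12345678.txt
```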
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File: content.py
Author: ZhangHaiou ([email protected])
Date: 2018/05/04
"""
import urllib   # Python 2
import re
import os
import sys

bi_urls = []

def getHtml(url):
    # Read the whole page as one string
    page = urllib.urlopen(url)
    html = page.read()
    #print html
    return html

def getImg(html):
    # Unused helper kept from an earlier script
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1

def geturl(html):
    # Same link extraction as in bi_news.py (unused in this script)
    for line in html:
        if re.search(r'<div class="mr10"><a href="\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm"', line):
            if re.search(r'<div class="mr10"><a href="2016-\d\d/\d\d/content_\d{4,}\.htm"', line):
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line)
                print(url)
                bi_urls.append(url)

def savefile(savepath, content):
    with open(savepath, "w") as fp:
        fp.write(content)

if __name__ == '__main__':
    for line in open(sys.argv[1], 'r'):
        line = line.strip()   # drop the trailing newline so URL concatenation works
        content = ""
        n = 1
        while n:    # this loop makes sure multi-page news is not missed
            if n > 1:
                htm = line + "_" + str(n)
            else:
                htm = line
            raw = getHtml(htm)
            if not re.findall(r'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">', raw):
                # avoid blank pages: no DOCTYPE means this page does not exist
                break
            print(htm)
            n = n + 1
            # for hang in raw:
            #     if re.search('^\<p\>.*\<\/p\>', hang):
            content = content + raw
        date = re.findall(r'\d\d\d\d-\d\d', line)[0]   # month folder, e.g. 2017-05
        filename = re.findall(r'\d{6,}', line)[0]      # digits of the article ID
        if not os.path.exists(date):                   # create the month directory if missing
            os.makedirs(date)
        savefile(date + "/" + filename + ".txt", content)
```
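With url.txt from the previous step in hand, run python content.py url.txt. Each article is then written into a YYYY-MM/ directory, one .txt file per article, named after the digits in its link. Note that what gets saved is still the raw HTML of every page of each article; extracting the actual parallel text is the cleaning step that has not been done yet.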