爬取学校就业指导网站上与学校有合作的企业——第一次用爬虫

   已经将数据库操作注释掉了

 1 from urllib.request import urlopen;
 2 from urllib.error import HTTPError;
 3 from bs4 import BeautifulSoup;
 4 import pymysql;
 5 
 6 num = 1480;
 7 #conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='mysql',charset='utf8');
 8 #cur = conn.cursor();
 9 #cur.execute("USE hfutxjh");
10 while num < 2000:
11     url = "http://gdjy.hfut.edu.cn/products/" + str(num) + ".html";
12     num = num + 1;
13     try:
14         html = urlopen(url);
15     except HTTPError as e:
16         print(num - 1);
17         print(e);
18         continue;
19     else:
20         if html is None:
21             print(num - 1);
22             print("URL is not found");
23         else:
24             bsObj = BeautifulSoup(html.read());
25             str1=str(num-1);
26             print(str1);
27             str2=bsObj.h3.get_text();
28             print(str2);
29             #cur.execute("INSERT INTO h3(id,h3text) VALUES('"+str1+"','"+str2+"')");
30             #print(cur.fetchone());
31 
32 #cur.close();
33 #conn.close();

猜你喜欢

转载自www.cnblogs.com/uasier/p/9259066.html