Web Crawler
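The script below scrapes free proxy addresses from the xicidaili.com listing pages, spins up one thread per candidate to test it against Baidu, and appends the proxies that respond with HTTP 200 to IpPool.txt.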

# -*- coding: utf-8 -*-
import threading

import requests
import urllib2
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


def get_page_source(url):
    # Fetch a page with a random User-Agent to dodge naive UA blocking
    headers = {'User-Agent': UserAgent().random}
    req = urllib2.Request(url, None, headers=headers)
    response = urllib2.urlopen(req)
    return response.read()


count = 0
filename = 'IpPool.txt'
ip_list = []  # collected across all pages (initializing it inside the loop would discard earlier pages)

for i in range(1, 3):
    url = "http://www.xicidaili.com/nn/%s" % i
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'html5lib')
    for idx, tr in enumerate(soup.find_all('tr')):
        if idx != 0:  # row 0 is the table header
            tds = tr.find_all('td')
            ip = tds[1].contents[0]
            port = tds[2].contents[0]
            ip_list.append(ip + ':' + port)

lock = threading.Lock()  # guards the shared counter and the output file


def test(i):
    global count
    # Register the proxy for both schemes; with only an 'http' key the
    # https:// test URL would bypass the proxy entirely.
    proxies = {'http': 'http://' + i, 'https': 'http://' + i}
    url = "https://www.baidu.com"
    try:
        # A timeout keeps dead proxies from hanging the thread forever
        resp = requests.get(url, proxies=proxies, timeout=5)
    except requests.RequestException:
        return
    if resp.status_code == 200:
        # print(i),  # Python 2: the trailing comma suppresses the newline
        with lock:  # serialize only the bookkeeping, not the network call
            count += 1
            with open(filename, 'a') as f:
                f.write(i + '\n')
            print("%s: %s %s" % (count, i, resp.status_code))


threads = []
for i in ip_list:
    thread = threading.Thread(target=test, args=(i,))
    threads.append(thread)
    thread.start()

# Block the main thread until all worker threads finish
for thread in threads:
    thread.join()
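After a run, IpPool.txt holds one working ip:port per line. A minimal sketch of how the pool might be consumed (pick_proxy is a hypothetical helper, not part of the script above; httpbin.org/ip is used only as an echo endpoint that reports the caller's IP):

import random
import requests

def pick_proxy(path='IpPool.txt'):
    # Hypothetical helper: choose one 'ip:port' line at random from the pool
    with open(path) as f:
        candidates = [line.strip() for line in f if line.strip()]
    return random.choice(candidates)

proxy = pick_proxy()
resp = requests.get('http://httpbin.org/ip',
                    proxies={'http': 'http://' + proxy}, timeout=5)
print(resp.text)  # should show the proxy's IP rather than your own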



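A second, standalone snippet fetches a single listing page with requests and prints the raw response, its detected encoding, and the status code: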
import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}
url = 'http://www.xicidaili.com/nn/1'
resp = requests.get(url, headers=headers)
print(resp.text)                   # decoded response body
print(resp.encoding)               # encoding guessed from the response headers
print(resp.text.encode('utf-8'))   # the body re-encoded as UTF-8 bytes
print(resp.status_code)            # HTTP status code
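requests derives resp.encoding from the HTTP headers alone, which is often wrong for Chinese pages served without an explicit charset; a minimal fix, using the library's own apparent_encoding attribute (detected from the response body), looks like this:

resp = requests.get(url, headers=headers)
resp.encoding = resp.apparent_encoding  # re-detect the charset from the body itself
print(resp.text)  # now decoded with the corrected encoding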


Reprinted from blog.csdn.net/douzhenwen/article/details/80199549