Web Crawler
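The script below scrapes free proxy addresses from the xicidaili.com listing pages, spins up one thread per candidate to test it against Baidu, and appends the proxies that respond with HTTP 200 to IpPool.txt.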

# -*- coding: utf-8 -*-
import threading

import requests
import urllib2
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


def get_page_source(url):
    # Fetch a page with a random User-Agent to dodge naive UA blocking
    headers = {'User-Agent': UserAgent().random}
    req = urllib2.Request(url, None, headers=headers)
    response = urllib2.urlopen(req)
    return response.read()


count = 0
filename = 'IpPool.txt'
ip_list = []  # collected across all pages (initializing it inside the loop would discard earlier pages)

for i in range(1, 3):
    url = "http://www.xicidaili.com/nn/%s" % i
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'html5lib')
    for idx, tr in enumerate(soup.find_all('tr')):
        if idx != 0:  # row 0 is the table header
            tds = tr.find_all('td')
            ip = tds[1].contents[0]
            port = tds[2].contents[0]
            ip_list.append(ip + ':' + port)

lock = threading.Lock()  # guards the shared counter and the output file


def test(i):
    global count
    # Register the proxy for both schemes; with only an 'http' key the
    # https:// test URL would bypass the proxy entirely.
    proxies = {'http': 'http://' + i, 'https': 'http://' + i}
    url = "https://www.baidu.com"
    try:
        # A timeout keeps dead proxies from hanging the thread forever
        resp = requests.get(url, proxies=proxies, timeout=5)
    except requests.RequestException:
        return
    if resp.status_code == 200:
        # print(i),  # Python 2: the trailing comma suppresses the newline
        with lock:  # serialize only the bookkeeping, not the network call
            count += 1
            with open(filename, 'a') as f:
                f.write(i + '\n')
            print("%s: %s %s" % (count, i, resp.status_code))


threads = []
for i in ip_list:
    thread = threading.Thread(target=test, args=(i,))
    threads.append(thread)
    thread.start()

# Block the main thread until all worker threads finish
for thread in threads:
    thread.join()
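After a run, IpPool.txt holds one working ip:port per line. A minimal sketch of how the pool might be consumed (pick_proxy is a hypothetical helper, not part of the script above; httpbin.org/ip is used only as an echo endpoint that reports the caller's IP):

import random
import requests

def pick_proxy(path='IpPool.txt'):
    # Hypothetical helper: choose one 'ip:port' line at random from the pool
    with open(path) as f:
        candidates = [line.strip() for line in f if line.strip()]
    return random.choice(candidates)

proxy = pick_proxy()
resp = requests.get('http://httpbin.org/ip',
                    proxies={'http': 'http://' + proxy}, timeout=5)
print(resp.text)  # should show the proxy's IP rather than your own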



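A second, standalone snippet fetches a single listing page with requests and prints the raw response, its detected encoding, and the status code: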
import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}
url = 'http://www.xicidaili.com/nn/1'
resp = requests.get(url, headers=headers)
print(resp.text)                   # decoded response body
print(resp.encoding)               # encoding guessed from the response headers
print(resp.text.encode('utf-8'))   # the body re-encoded as UTF-8 bytes
print(resp.status_code)            # HTTP status code
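requests derives resp.encoding from the HTTP headers alone, which is often wrong for Chinese pages served without an explicit charset; a minimal fix, using the library's own apparent_encoding attribute (detected from the response body), looks like this:

resp = requests.get(url, headers=headers)
resp.encoding = resp.apparent_encoding  # re-detect the charset from the body itself
print(resp.text)  # now decoded with the corrected encoding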


Reprinted from blog.csdn.net/douzhenwen/article/details/80199549