Subdomain Collection

# Crawler approach (using a search engine, e.g. Baidu)
#coding=utf-8
import re
import requests
sites = []
for i in range(0, 10):   # first 10 result pages as an example
    i = i * 10
    url = 'https://www.baidu.com/s?wd=site:xxx.com.cn&pn=%s' % i     # build the search request URL
    response = requests.get(url).content   # GET request; .content is the response body
    baidudomain = re.findall(r'style="text-decoration:none;">(.*?)/', response)
    sites += list(baidudomain)
site = list(set(sites))  # set() removes duplicates
print site
print "\nThe number of sites is %d" % len(site)
for i in site:
    print i
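
For reference, a rough Python 3 sketch of the same idea (my adaptation, not part of the original post; the regex still depends on Baidu's current result markup, and xxx.com.cn is a placeholder target):

# -*- coding: utf-8 -*-
# Python 3 version of the search-engine approach (sketch)
import re
import requests

sites = set()
for page in range(10):                          # first 10 result pages
    url = 'https://www.baidu.com/s?wd=site:xxx.com.cn&pn=%d' % (page * 10)
    html = requests.get(url, timeout=5).text    # .text decodes the response body
    sites.update(re.findall(r'style="text-decoration:none;">(.*?)/', html))

print("The number of sites is %d" % len(sites))
for s in sorted(sites):
    print(s)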

# Using Python modules (link-crawling approach)
from __future__ import division
import lxml.html
from lxml.html import fromstring
import requests
import re
import mechanize
import operator
import sys
import os
from time import sleep

class SameFileError(Exception): pass
class NoneTypeError(Exception): pass


global formlist
reqlist = []
feature_hub = []
_Root = os.getcwd()

def _requestData(url):
    headers = {
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip, deflate',
        # 'Cookie': '',
        'Upgrade-Insecure-Requests': '1'
    }
    try:
        req = requests.get(url, headers=headers, timeout=5)
    except Exception:
        return 'err', url, None
    return req.status_code, url, req.text


def getLinks(url):
    # Fetch the page and return (list_of_links, page_data).
    try:
        resType, resHost, resData = _requestData(url)
        if not resData:
            raise NoneTypeError
        doc = lxml.html.document_fromstring(resData)
        tags = ['a', 'iframe', 'frame']
        doc.make_links_absolute(resHost)  # make sure the collected links are absolute URLs
    except Exception:
        return resHost, None
    links = doc.iterlinks()
    trueLinks = []
    for l in links:
        if l[0].tag in tags:
            trueLinks.append(l[2])
    return trueLinks, resData


def correct_url(url):
    # Prepend a scheme if the URL does not already have one.
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url.strip()
    return url


def middle_name(url):
    # Extract the keyword part of the domain, e.g. 'example' from 'http://www.example.com'.
    # middle_name = re.findall(r'[\/\.]([\s\S]+)\.', url)
    # tidy the url (str.strip() removes characters, not prefixes, so remove the prefixes explicitly)
    url_tidy = url.strip()
    for prefix in ('http://', 'https://', 'www.'):
        if url_tidy.startswith(prefix):
            url_tidy = url_tidy[len(prefix):]
    # dot = re.findall('\.', url_tidy)
    re_url = re.compile(r'([-\w]+)\.')
    try:
        middle = re_url.match(url_tidy).groups(0)
    except Exception:
        return None
    return middle[0]
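# Example (illustrative values, not from the original):
#   middle_name('http://www.example.com')  -> 'example'
#   middle_name('http://xx.com.cn')        -> 'xx'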

def getdiffer(list1, list2):
    # Return the shorter length when the two lists are of comparable size,
    # or False when their lengths differ by more than 5.
    if len(list1) < len(list2):
        length = len(list1)
        if (len(list2) - len(list1)) > 5:
            return False
    else:
        length = len(list2)
        if (len(list1) - len(list2)) > 5:
            return False
    return length

def str_compare(str1, str2, accuracy=0.80):
    # Rough similarity check: ratio of positions where the two strings share the same character.
    list1 = list(str1)
    list2 = list(str2)
    score = 0
    # print "comparing:", str1, str2
    total = len(list1)
    length = getdiffer(list1, list2)
    if length is False:
        return False
    for i in xrange(length):
        if list1[i] == list2[i]:
            score += 1
    ratio = score / total
    if ratio > accuracy:
        # print "similar"
        return True
    return False
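# Quick illustration of the similarity heuristic (example values, not from the original):
#   str_compare('http://a.com/news/1.html', 'http://a.com/news/2.html')  -> True  (23/24 positions match)
#   str_compare('http://a.com/news/1.html', 'http://a.com/about.html')   -> False (only the shared prefix matches)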


def feature_match(link):
    global url_old
    for link_old in url_old:
        if str_compare(link_old, link):
            return True
    return False


def feature_catch(link):
    pass


def feature_filter(link):
    # Check whether the link matches a feature (similar URL) that has already been seen.
    if feature_match(link):
        return True
    return False


def single_url(url):
    # Crawl a single URL entry point.
    # try:
    global url_add
    global url_old
    global middle
    url = correct_url(url)
    # collect the links and page data
    url_links, data = getLinks(url)
    if data is None:
        return
    for link in url_links:
        sys.stdout.write('!')
        if link == url:
            continue
        if link in url + '/index':
            continue
        if 'javascript' in link:
            continue
        if link in url_old:
            continue
        if middle not in link:
            continue
        if feature_filter(link):
            continue
        try:
            print "\n", link
        except Exception:
            pass
        with open(_Root + "\\Results\\" + middle + "_links.txt", "a") as f:
            f.write(link + "\n")
        # if link not in url_old and link not in url_add and 'http://www.xxx.com' in link:
        #     print link
        #     Findsubmit(link)
        url_add.append(link)
        url_old.append(link)  # already queued in url_add, so treat it as a known URL
    # except Exception, e:
    #     print e
    #     pass


def Findsubmit(link):
    global reqlist
    try:
        br = mechanize.Browser()  # initiating the browser
        br._factory.is_html = True
        br.addheaders = [('User-agent',
                          'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        br.open(str(link), timeout=15)
        if br.forms():
            params = list(br.forms())
            for par in params:
                for p in par.controls:
                    ps = str(p)
                    # print p.name
                    if 'TextControl' in ps:
                        param = str(p.name)
                        reqstr = par.action + par.method + param
                        if reqstr not in reqlist:
                            reqlist.append(reqstr)
                            testxss(par.action, par.method, param)
    except Exception, e:
        print e
        pass


def testxss(action, method, param):
    method = method.lower()
    headers = {'content-type': 'application/json',
               'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
    if method == "get":
        print "=" * 10 + "get start" + "=" * 10
        url = action + "/?" + param + "=test1234"
        print url
        # response = requests.get(url, headers=headers)
        # print response.text
        print "=" * 10 + "get end" + "=" * 10
    if method == "post":
        data = {'{0}'.format(param): "test"}
        print "=" * 10 + "post start" + "=" * 10
        print action
        print data
        # response = requests.post(action, data=data, headers=headers)
        # print response.text
        print "=" * 10 + "post end" + "=" * 10


def findlink(input, level=2):
    global url_new
    global url_old
    global url_add
    global middle
    # main entry point
    url_new = []  # links belonging to the current level
    url_old = []  # every link that has already been crawled
    url_add = []  # links newly discovered at each level
    # url = 'http://www.xxx.com'
    url = input
    middle = middle_name(url)
    url_new.append(url)
    for level_i in xrange(level):
        for i in xrange(len(url_new)):
            url_new_i = url_new[i]
            url_old.append(url_new_i)
            sleep(0.5)
            single_url(url_new_i)
        url_new = url_add
    # with open(middle + "_links.txt", "w") as f:
    #     for line in url_old:
    #         f.write(line + "\n")


if __name__ == '__main__':
    # try:
    #     url = sys.argv[1]
    # except Exception:
    #     print "Usage: python findlinks.py www.example.com"
    #     exit()
    # # url = 'http://www.xxx.com'
    findlink('xx.com.cn', 10)
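
Note: single_url appends its findings to <current directory>\Results\<middle>_links.txt, so the Results directory has to exist before the crawl starts; a small guard (my addition, not in the original script) could be placed inside the __main__ block before the findlink call:

    results_dir = os.path.join(_Root, "Results")
    if not os.path.isdir(results_dir):   # create the output directory if it is missing
        os.makedirs(results_dir)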
# Dictionary (brute-force) approach
#-*-coding:utf-8-*-

import requests

def verify(protocol, ip, port):
    def get_pass_dict():
        # read candidate subdomain prefixes, one per line
        pass_dict = []
        with open('./dic.txt', 'r') as f:
            for line in f.readlines():
                line = line.strip('\n')
                pass_dict.append(line)
        return pass_dict
    dics = get_pass_dict()
    for dic in dics:
        url = protocol + '://' + dic + '.' + ip + ':' + str(port)
        try:
            response = requests.get(url, verify=False, timeout=5, allow_redirects=False)
        except requests.RequestException:
            continue   # skip candidates that do not resolve or do not respond
        if response.status_code == 200:
            print(url)

if __name__ == '__main__':
    res = verify('http', 'baidu.com', '80')
    print(res)
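
The dic.txt wordlist is assumed here to hold one candidate subdomain prefix per line, for example:

www
mail
smtp
dev
test
admin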

Other approaches:
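
One other commonly used source is certificate transparency logs; a minimal sketch of my own (not from the original post), assuming the public crt.sh JSON endpoint is reachable:

# Query certificate transparency logs via crt.sh for subdomains (illustrative sketch)
import requests

def crtsh_subdomains(domain):
    url = 'https://crt.sh/?q=%25.' + domain + '&output=json'
    resp = requests.get(url, timeout=10)
    subdomains = set()
    for entry in resp.json():
        # each certificate entry may list several names separated by newlines
        for name in entry.get('name_value', '').split('\n'):
            name = name.strip().lstrip('*.')
            if name.endswith(domain):
                subdomains.add(name)
    return sorted(subdomains)

if __name__ == '__main__':
    for sub in crtsh_subdomains('example.com'):
        print(sub)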

Reprinted from: https://www.cnblogs.com/AtesetEnginner/p/11005126.html
