Sesame HTTP: Python crawler in practice - grabbing Taobao MM photos

The goal of this article

1. Grab Taobao MM's name, avatar, age

2. Grab the profile and photo pictures of each MM

3. Save each MM's photos locally, in a separate folder per MM

4. Become familiar with the process of saving files

1. URL format

The URL we use here is http://mm.taobao.com/json/request_top_list.htm?page=1 . The part before the question mark is the base address, and the page parameter after it is the page number, which can be changed freely. Opening the URL shows brief profiles of Taobao MMs, each with a hyperlink to her personal details page.

We need to grab the avatar address of this page, MM's name, MM's age, MM's residence, and MM's personal details page address.

2. Grab brief information

I believe that after several hands-on exercises you are already very familiar with fetching a page and extracting addresses from it, so there is no difficulty here. We first grab each MM's details page address, name, age and other information from this page and print it out. The code is pasted directly below:


__author__ = 'CQC'
# -*- coding:utf-8 -*-

import urllib
import urllib2
import re

class Spider:
    """First-draft crawler: fetch one list page and print each entry found."""

    def __init__(self):
        # Base address of the list endpoint; the page number is appended as
        # the "page" query parameter in getPage().
        self.siteURL = 'http://mm.taobao.com/json/request_top_list.htm'

    def getPage(self,pageIndex):
        """Return the body of list page ``pageIndex`` decoded from GBK.

        The requested URL is printed before the request is made.
        """
        url = self.siteURL + "?page=" + str(pageIndex)
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and also parses as a Python 3 call.
        print(url)
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            return response.read().decode('gbk')
        finally:
            # Release the connection even when read() or decode() raises.
            response.close()

    def getContents(self,pageIndex):
        """Print the five captured fields of every entry on list page ``pageIndex``.

        Each printed line holds, in order: the first link URL inside the
        entry, the first image URL, the text of the "lady-name" link, the
        text of the following <strong> tag and the text of the following
        <span> tag.
        """
        page = self.getPage(pageIndex)
        pattern = re.compile('<div class="list-item".*?pic-word.*?<a href="(.*?)".*?<img src="(.*?)".*?<a class="lady-name.*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>',re.S)
        for item in pattern.findall(page):
            # item is a 5-tuple of captured groups, printed space-separated.
            print(u"%s %s %s %s %s" % item)

# Print the entries found on list page 1.
spider = Spider()
spider.getContents(pageIndex=1)

The running result is as follows

QQ screenshot 20150220234132

2. Introduction to file writing

Here there are two kinds of writing to handle: writing pictures and writing text.

1) Write the picture



#Pass in the image address, file name, save a single image
def saveImg(self,imageURL,fileName):
    """Download ``imageURL`` and write the raw bytes to ``fileName``.

    ``fileName`` is opened in binary mode and overwritten if it exists.
    """
    u = urllib.urlopen(imageURL)
    try:
        data = u.read()
    finally:
        # Close the URL handle even when the read fails part-way.
        u.close()
    # The with-block closes the file even if write() raises.
    with open(fileName, 'wb') as f:
        f.write(data)

2) write text



def saveBrief(self,content,name):
    """Write ``content`` as UTF-8 to ``<name>/<name>.txt``.

    The directory ``name`` must already exist; open() raises otherwise.
    """
    fileName = name + "/" + name + ".txt"
    # The with-block guarantees the handle is flushed and closed; the
    # original never closed it.
    with open(fileName,"w+") as f:
        print(u"secretly saving her personal information as %s" % fileName)
        f.write(content.encode('utf-8'))

3) Create a new directory



#create new directory
def mkdir(self,path):
    """Create directory ``path`` (after trimming whitespace) if it is absent.

    Returns True when the directory was created and False when something
    already exists at that path, in which case nothing is touched.
    """
    target = path.strip()
    if os.path.exists(target):
        # Already present: report that no directory was created.
        return False
    # makedirs also creates any missing intermediate directories.
    os.makedirs(target)
    return True

3. Code improvement

The main knowledge points have been covered above. If you have read the previous chapters, it is not a problem to complete this crawler. The specific details will not be repeated here, and the code will be directly posted.

spider.py



__author__ = 'CQC'
# -*- coding:utf-8 -*-
 
import os  # the listing showed "import them", a garbled "import os"
import re
import urllib
import urllib2

import tool
 
#grab MM
class Spider:
    """Crawler that saves each listed MM's profile text, avatar and photos.

    For every entry on a list page a folder named after the entry is created
    in the current working directory, and the brief text, the avatar image
    and the photos from the details page are written into it.
    """

    # Captures per list entry: details-page URL, avatar URL, name, age, residence.
    _ITEM_PATTERN = re.compile('<div class="list-item".*?pic-word.*?<a href="(.*?)".*?<img src="(.*?)".*?<a class="lady-name.*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>',re.S)
    # Captures the markup between the opening "mm-aixiu-content" div and the
    # next HTML comment.
    _CONTENT_PATTERN = re.compile('<div class="mm-aixiu-content".*?>(.*?)<!--',re.S)
    # Captures the src attribute of each img tag.
    _IMG_PATTERN = re.compile('<img.*?src="(.*?)"',re.S)

    def __init__(self):
        """Set the list endpoint and build the tag-stripping helper."""
        self.siteURL = 'http://mm.taobao.com/json/request_top_list.htm'
        self.tool = tool.Tool()

    def _fetch(self,target):
        """Open ``target`` (URL string or Request) and return its body decoded from GBK."""
        response = urllib2.urlopen(target)
        try:
            return response.read().decode('gbk')
        finally:
            # Release the connection even when read() or decode() raises.
            response.close()

    def _contentHtml(self,page):
        """Return the profile markup found in ``page``, or None when it is absent."""
        match = self._CONTENT_PATTERN.search(page)
        return match.group(1) if match else None

    def _extension(self,url):
        """Return the text after the last '.' of ``url``, or "jpg" when it is longer than 3 characters."""
        tail = url.split('.').pop()
        return tail if len(tail) <= 3 else "jpg"

    def getPage(self,pageIndex):
        """Return the decoded body of list page ``pageIndex``."""
        url = self.siteURL + "?page=" + str(pageIndex)
        return self._fetch(urllib2.Request(url))

    def getContents(self,pageIndex):
        """Return one 5-element list per entry on list page ``pageIndex``.

        Element order: details-page URL, avatar URL, name, age, residence.
        """
        page = self.getPage(pageIndex)
        return [list(item) for item in self._ITEM_PATTERN.findall(page)]

    def getDetailPage(self,infoURL):
        """Return the decoded body of the personal details page at ``infoURL``."""
        return self._fetch(infoURL)

    def getBrief(self,page):
        """Return the profile text of a details page with markup stripped.

        Returns an empty string when the page has no profile section, instead
        of raising AttributeError on the failed regex search.
        """
        content = self._contentHtml(page)
        if content is None:
            return u""
        return self.tool.replace(content)

    def getAllImg(self,page):
        """Return the src URLs of all images inside the profile section.

        Returns an empty list when the page has no profile section.
        """
        content = self._contentHtml(page)
        if content is None:
            return []
        return self._IMG_PATTERN.findall(content)

    def saveImgs(self,images,name):
        """Download every URL in ``images`` to ``<name>/<n>.<ext>``, n counting from 1."""
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and also parses as a Python 3 call.
        print(u"found %s total %s photos" % (name, len(images)))
        for number, imageURL in enumerate(images, 1):
            fileName = name + "/" + str(number) + "." + self._extension(imageURL)
            self.saveImg(imageURL,fileName)

    def saveIcon(self,iconURL,name):
        """Download the avatar at ``iconURL`` to ``<name>/icon.<ext>``."""
        # Uses the same extension fallback as saveImgs, so a URL without a
        # short suffix cannot produce a file name containing path separators.
        fileName = name + "/icon." + self._extension(iconURL)
        self.saveImg(iconURL,fileName)

    def saveBrief(self,content,name):
        """Write ``content`` as UTF-8 to ``<name>/<name>.txt``."""
        fileName = name + "/" + name + ".txt"
        # The with-block guarantees the handle is flushed and closed; the
        # original never closed it.
        with open(fileName,"w+") as f:
            print(u"secretly saving her personal information as %s" % fileName)
            f.write(content.encode('utf-8'))

    def saveImg(self,imageURL,fileName):
        """Download ``imageURL`` and write the raw bytes to ``fileName``."""
        u = urllib.urlopen(imageURL)
        try:
            data = u.read()
        finally:
            # Close the URL handle even when the read fails part-way.
            u.close()
        with open(fileName, 'wb') as f:
            f.write(data)
        print(u"is quietly saving a picture of her as %s" % fileName)

    def mkdir(self,path):
        """Create directory ``path`` (whitespace-trimmed) if it is absent.

        Returns True when the directory was created, False when something
        already exists at that path.
        """
        path = path.strip()
        if os.path.exists(path):
            # The original message claimed the folder "has been created
            # successfully" in this branch, where nothing is created.
            print(u"The folder named %s already exists" % path)
            return False
        print(u"Secretly created a new folder named %s" % path)
        os.makedirs(path)
        return True

    def savePageInfo(self,pageIndex):
        """Save the profile, avatar and photos of every entry on one list page."""
        for item in self.getContents(pageIndex):
            detailURL, iconURL, name, age, place = item
            print(u"Found a model whose name is %s Fangling %s ,she is in %s" % (name, age, place))
            print(u"is secretly saving the information of %s" % name)
            print(u"Accidentally found her personal address is %s" % detailURL)
            # Fetch and parse the details page before touching the disk.
            detailPage = self.getDetailPage(detailURL)
            brief = self.getBrief(detailPage)
            images = self.getAllImg(detailPage)
            # The folder must exist before any of the save* calls write into it.
            self.mkdir(name)
            self.saveBrief(brief,name)
            self.saveIcon(iconURL,name)
            self.saveImgs(images,name)

    def savePagesInfo(self,start,end):
        """Run savePageInfo for every list page from ``start`` to ``end`` inclusive."""
        for i in range(start,end+1):
            print(u"I'm secretly looking for the  %s th place, see if the MMs are there" % i)
            self.savePageInfo(i)
 
 
# Pass the first and last list page to crawl (both inclusive); with 2 and 10
# the MMs on pages 2 through 10 are grabbed.
spider = Spider()
spider.savePagesInfo(start=2, end=10)



__author__ = 'CQC'
#-*- coding:utf-8 -*-
import re
 
#Handle page tag class
class Tool:
    """Turn an HTML fragment into plain text by regex substitution."""

    # Remove img tags, runs of 1-7 spaces, and &nbsp; entities.
    # NOTE(review): the listing showed a bare space as the third alternative,
    # which could never match because " {1,7}" already consumes every space.
    # The original comment's dangling third item suggests the entity was lost
    # when the page was rendered, so it is restored here - confirm against
    # the upstream source.
    removeImg = re.compile(r'<img.*?>| {1,7}|&nbsp;')
    # Remove hyperlink tags, keeping the link text.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that are replaced with a newline.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cell openers are replaced with a tab.
    replaceTD = re.compile(r'<td>')
    # A single or doubled <br> is replaced with one newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Any tag still left after the substitutions above.
    removeExtraTag = re.compile(r'<.*?>')
    # Runs of newlines, collapsed to a single newline.
    removeNoneLine = re.compile(r'\n+')

    def replace(self,x):
        """Return ``x`` with markup removed and surrounding whitespace stripped.

        Note that every space character is removed, including spaces between
        words, because the first pattern deletes runs of spaces.
        """
        # Order matters: spaces are deleted first (which also compacts tags
        # such as '<a href=...>' before the hyperlink pattern runs), specific
        # tags are mapped to whitespace next, and the catch-all tag removal
        # runs only after those tags have been handled.
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
            (self.removeNoneLine, "\n"),
        )
        for pattern, substitute in steps:
            x = pattern.sub(substitute, x)
        return x.strip()

The two files above (spider.py and tool.py) are all of the code. Run it and try it out; it is quite a thrill.

See what's changed in the folder

Before you know it, a large number of MM pictures have arrived on your computer, so hurry up and try it!