# Simple crawler: downloads an entire page and saves it to disk. Change the URL in the code to crawl a different website.
import urllib.request # 导入包
def getHtml(url):
    """Download the resource at ``url`` and return its raw body.

    Args:
        url: Address to fetch; handed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Raised by ``urlopen`` when the request
            fails; it is not caught here.
    """
    # urlopen() returns a response object (http.client.HTTPResponse for
    # HTTP/HTTPS URLs), not the content itself; read() extracts the body
    # as bytes. The with-block closes the response, and the connection
    # behind it, even if read() raises.
    with urllib.request.urlopen(url) as response:
        return response.read()
def saveHtml(fileName, fileContent):
    """Write ``fileContent`` to ``fileName`` as raw bytes.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten and the content is written without any text
    encoding step.

    Args:
        fileName: Path of the file to create or overwrite.
        fileContent: Bytes-like data to store.
    """
    with open(fileName, mode="wb") as sink:
        sink.write(fileContent)
def main(url="https://www.zhihuishu.com/", fileName="theHtml.html"):
    """Fetch one page and store its bytes on disk unchanged.

    Both arguments default to the values that were previously hard-coded,
    so calling ``main()`` with no arguments behaves as before.

    Args:
        url: Address of the page to download.
        fileName: Path of the file the downloaded bytes are written to.
    """
    html = getHtml(url)  # raw bytes of the response body
    saveHtml(fileName, html)
    print("保存网页完成")  # user-facing "page saved" notice
if __name__ == "__main__":  # run the crawler only when executed as a script
    main()
# Remaining problem: many websites have anti-crawler mechanisms, so the downloaded file may not be the page you actually wanted.