1.Count email in a database
socket1.py
import socket
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80))
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
mysock.send(cmd)
while True:
data = mysock.recv(512)
if len(data) < 1:
break
print(data.decode(),end='')
mysock.close()
socket2.py
import socket
# This is using HTTP 1.0 - not all servers support the oldest protocol
# Try http://data.pr4e.org/romeo.txt if your server fails.
url = input('Enter: ')
words = url.split('/')
host = words[2]
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect((host, 80))
mysock.send(('GET '+url+' HTTP/1.0\r\n\r\n').encode())
while True:
data = mysock.recv(512)
if (len(data) < 1):
break
print(data.decode(), end='')
mysock.close()
2.Using Urllib and Beautiful Soap
bs4_1
# To run this, download the BeautifulSoup zip file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
sum1 = 0
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
# Retrieve all of the anchor tags
tags = soup('span')
#print(tags)
for tag in tags:
num = tag.contents[0]
#print(type(num))
intnum = int(num)
sum1 += intnum
print(sum1)
# Look at the parts of a tag
#print('TAG:', tag)
#print('URL:', tag.get('href', None))
#print('Contents:', tag.contents[0])
#print('Attrs:', tag.attrs)
'''
for tag in tags:
# Look at the parts of a tag
print('TAG:', tag)
print('URL:', tag.get('href', None))
print('Contents:', tag.contents[0])
print('Attrs:', tag.attrs)
'''
bs4_2
# To run this, download the BeautifulSoup zip file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
namelist = list()
urlist = list()
i = 0
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
#第一次读取url
url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
#定义一个可供调用的,返回网址的函数
def Url_Read():
for tag in tags:
strurl = tag.get('href' , None)
urlist.append(strurl)
urlname = urlist[17]
return urlname
#功能主体
while i<7:
if i == 0:
passurl = Url_Read()
#print(passurl)
i += 1
elif (i == 1)|(i == 2)|(i == 3)|(i == 4)|(i == 5):
html = urllib.request.urlopen(passurl, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
urlist.clear() #清空列表,很重要!
passurl = Url_Read()
#print(passurl)
i += 1
elif i ==6:
html = urllib.request.urlopen(passurl, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
for tag in tags:
strname = tag.contents[0]
namelist.append(strname)
i += 1
print(namelist[17])
#print(Url_Read())
3.Extract data from XML
XML1
import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
sumnum = 0
while True:
address = input('Enter the url: ')
if len(address) < 1:
break #这一块的作用是始终让url输入框出现,不是执行完一次就over了
xml = urllib.request.urlopen(address , context=ctx).read() #urllib库读取
#print(xml)
stuff = ET.fromstring(xml) #返回的stuff是一个tree
lst = stuff.findall('comments/comment') #这里找三层目录反而找不到
#print(len(lst))
for item in lst:
#print('Name:' , item.find('count').text)
num = int(item.find('count').text)
sumnum += num
print(sumnum)
4.Json
geojson1.py
import urllib.request, urllib.parse, urllib.error
import json
import ssl
#导入相关模块
api_key = False
# If you have a Google Places API key, enter it here
# api_key = 'AIzaSy___IDByT70'
# https://developers.google.com/maps/documentation/geocoding/intro
if api_key is False:
api_key = 42
serviceurl = 'http://py4e-data.dr-chuck.net/json?'
else :
serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
while True:
address = input('Enter location: ')
if len(address) < 1: break #实现连续交互的效果
parms = dict()
parms['address'] = address
if api_key is not False: parms['key'] = api_key
url = serviceurl + urllib.parse.urlencode(parms)
print('Retrieving', url)
uh = urllib.request.urlopen(url, context=ctx)
data = uh.read().decode()
print('Retrieved', len(data), 'characters')
try:
js = json.loads(data)
except:
js = None
if not js or 'status' not in js or js['status'] != 'OK':
print('==== Failure To Retrieve ====')
print(data)
continue
print(json.dumps(js, indent=4))
lat = js['results'][0]['geometry']['location']['lat']
lng = js['results'][0]['geometry']['location']['lng']
print('lat', lat, 'lng', lng)
location = js['results'][0]['formatted_address']
print(location)
geojson_test.py
import urllib.request, urllib.parse, urllib.error
import json
import ssl
#导入相关模块
api_key = False
# If you have a Google Places API key, enter it here
# api_key = 'AIzaSy___IDByT70'
# https://developers.google.com/maps/documentation/geocoding/intro
if api_key is False:
api_key = 42
serviceurl = 'http://py4e-data.dr-chuck.net/json?'
else :
serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
while True:
address = input('Enter location: ')
if len(address) < 1: break #实现连续交互的效果
parms = dict()
parms['address'] = address
if api_key is not False: parms['key'] = api_key
url = serviceurl + urllib.parse.urlencode(parms)
print('Retrieving', url)
uh = urllib.request.urlopen(url, context=ctx)
data = uh.read().decode()
print('Retrieved', len(data), 'characters')
try:
js = json.loads(data)
except:
js = None
if not js or 'status' not in js or js['status'] != 'OK':
print('==== Failure To Retrieve ====')
print(data)
continue
print(json.dumps(js, indent=4))
lat = js['results'][0]['geometry']['location']['lat']
lng = js['results'][0]['geometry']['location']['lng']
print('lat', lat, 'lng', lng)
location = js['results'][0]['formatted_address']
print(location)
5.Count email in database
count_organization.py
import sqlite3
conn = sqlite3.connect('orgdb.sqlite')
cur = conn.cursor()
cur.execute('DROP TABLE IF EXISTS Counts')
cur.execute('''
CREATE TABLE Counts (org TEXT , count INTEGER)''')
fname = input('Enter the file name:')
if (len(fname) < 1):
fname = 'mbox-short.txt'
fh = open(fname)
for line in fh:
if not line.startswith('From:'):
continue
cut1 = line.split()
email = cut1[1]
cut2 = email.split('@')
org = cut2[1]
cur.execute('SELECT count FROM Counts WHERE org = ?' , (org,))
row = cur.fetchone()
if row is None:
cur.execute('INSERT INTO Counts (org , count) VALUES (? , 1)' , (org,))
else:
cur.execute('UPDATE Counts SET count = count + 1 WHERE org = ?' , (org,))
conn.commit()
# https://www.sqlite.org/lang_select.html
sqlstr = 'SELECT org , count FROM Counts ORDER BY count DESC LIMIT 10'
for row in cur.execute(sqlstr):
print(str(row[0]) , row[1])
cur.close
6.Tracks-音乐软件
tracks.py
import xml.etree.ElementTree as ET
import sqlite3
conn = sqlite3.connect('trackdb.sqlite')
cur = conn.cursor()
# Make some fresh tables using executescript()
cur.executescript('''
DROP TABLE IF EXISTS Artist;
DROP TABLE IF EXISTS Genre;
DROP TABLE IF EXISTS Album;
DROP TABLE IF EXISTS Track;
CREATE TABLE Artist (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE Genre (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE Album (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
artist_id INTEGER,
title TEXT UNIQUE
);
CREATE TABLE Track (
id INTEGER NOT NULL PRIMARY KEY
AUTOINCREMENT UNIQUE,
title TEXT UNIQUE,
album_id INTEGER,
genre_id INTEGER,
len INTEGER, rating INTEGER, count INTEGER
);
''')
fname = input('Enter file name: ')
if ( len(fname) < 1 ) : fname = 'Library.xml'
# <key>Track ID</key><integer>369</integer>
# <key>Name</key><string>Another One Bites The Dust</string>
# <key>Artist</key><string>Queen</string>
def lookup(d, key):
found = False
for child in d:
if found : return child.text
if child.tag == 'key' and child.text == key :
found = True
return None
stuff = ET.parse(fname)
all = stuff.findall('dict/dict/dict')
print('Dict count:', len(all))
for entry in all:
if ( lookup(entry, 'Track ID') is None ) : continue
name = lookup(entry, 'Name')
artist = lookup(entry, 'Artist')
album = lookup(entry, 'Album')
genre = lookup(entry, 'Genre')
count = lookup(entry, 'Play Count')
rating = lookup(entry, 'Rating')
length = lookup(entry, 'Total Time')
if name is None or artist is None or album is None or genre is None : #Guardian Part
continue
print(name, artist, genre , album, count, rating, length)
cur.execute('''INSERT OR IGNORE INTO Artist (name)
VALUES ( ? )''', ( artist, ) )
cur.execute('SELECT id FROM Artist WHERE name = ? ', (artist, ))
artist_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Genre (name)
VALUES ( ? )''', (genre,))
cur.execute('SELECT id FROM Genre WHERE name = ? ', (genre,))
genre_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Album (title, artist_id)
VALUES ( ?, ? )''', ( album, artist_id ) )
cur.execute('SELECT id FROM Album WHERE title = ? ', (album, ))
album_id = cur.fetchone()[0]
cur.execute('''INSERT OR REPLACE INTO Track
(title, album_id, genre_id , len, rating, count)
VALUES ( ?, ?, ? , ?, ?, ? )''',
( name, album_id, genre_id , length, rating, count ) )
conn.commit()
7、多名学生与课程的数据库
many-to-many.py
import json
import sqlite3
conn = sqlite3.connect('rosterdb.sqlite')
cur = conn.cursor()
# Do some setup
#关于excutescript的作用,可以同时执行多条用分号隔开的语句
cur.executescript('''
DROP TABLE IF EXISTS User;
DROP TABLE IF EXISTS Member;
DROP TABLE IF EXISTS Course;
CREATE TABLE User (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE Course (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
title TEXT UNIQUE
);
CREATE TABLE Member (
user_id INTEGER,
course_id INTEGER,
role INTEGER,
PRIMARY KEY (user_id, course_id)
)
''')
fname = input('Enter file name: ')
if len(fname) < 1:
fname = 'roster_data.json'
# [
# [ "Charley", "si110", 1 ],
# [ "Mea", "si110", 0 ],
str_data = open(fname).read()
json_data = json.loads(str_data)
for entry in json_data:
name = entry[0];
title = entry[1];
role = entry[2];
print((name, title , role))
cur.execute('''INSERT OR IGNORE INTO User (name)
VALUES ( ? )''', ( name, ) )
cur.execute('SELECT id FROM User WHERE name = ? ', (name, ))
user_id = cur.fetchone()[0]
cur.execute('''INSERT OR IGNORE INTO Course (title)
VALUES ( ? )''', ( title, ) )
cur.execute('SELECT id FROM Course WHERE title = ? ', (title, ))
course_id = cur.fetchone()[0]
cur.execute('''INSERT OR REPLACE INTO Member
(user_id, course_id , role) VALUES ( ?, ? , ? )''',
( user_id, course_id , role) )
conn.commit()
8、Geocoding
geoload.py-读(parse)
import urllib.request, urllib.parse, urllib.error
import http
import sqlite3
import json
import time
import ssl
import sys
api_key = False
# If you have a Google Places API key, enter it here
# api_key = 'AIzaSy___IDByT70'
if api_key is False:
api_key = 42
serviceurl = "http://py4e-data.dr-chuck.net/json?"
else :
serviceurl = "https://maps.googleapis.com/maps/api/geocode/json?"
# Additional detail for urllib
# http.client.HTTPConnection.debuglevel = 1
conn = sqlite3.connect('geodata.sqlite')
cur = conn.cursor()
cur.execute('''
CREATE TABLE IF NOT EXISTS Locations (address TEXT, geodata TEXT)''')
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
fh = open("where.data")
count = 0
for line in fh:
if count > 200 :
print('Retrieved 200 locations, restart to retrieve more')
break
address = line.strip()
print('')
cur.execute("SELECT geodata FROM Locations WHERE address= ?",
(memoryview(address.encode()), ))
try:
data = cur.fetchone()[0]
print("Found in database ",address)
continue
except:
pass
parms = dict()
parms["address"] = address
if api_key is not False: parms['key'] = api_key
url = serviceurl + urllib.parse.urlencode(parms)
print('Retrieving', url)
uh = urllib.request.urlopen(url, context=ctx)
data = uh.read().decode()
print('Retrieved', len(data), 'characters', data[:20].replace('\n', ' '))
count = count + 1
try:
js = json.loads(data)
except:
print(data) # We print in case unicode causes an error
continue
if 'status' not in js or (js['status'] != 'OK' and js['status'] != 'ZERO_RESULTS') :
print('==== Failure To Retrieve ====')
print(data)
break
cur.execute('''INSERT INTO Locations (address, geodata)
VALUES ( ?, ? )''', (memoryview(address.encode()), memoryview(data.encode()) ) )
conn.commit()
if count % 10 == 0 :
print('Pausing for a bit...')
time.sleep(5)
print("Run geodump.py to read the data from the database so you can vizualize it on a map.")
geodump.py-与json交互
import sqlite3
import json
import codecs
conn = sqlite3.connect('geodata.sqlite')
cur = conn.cursor()
cur.execute('SELECT * FROM Locations')
fhand = codecs.open('where.js', 'w', "utf-8")
fhand.write("myData = [\n")
count = 0
for row in cur :
data = str(row[1].decode())
try: js = json.loads(str(data))
except: continue
if not('status' in js and js['status'] == 'OK') : continue
lat = js["results"][0]["geometry"]["location"]["lat"]
lng = js["results"][0]["geometry"]["location"]["lng"]
if lat == 0 or lng == 0 : continue
where = js['results'][0]['formatted_address']
where = where.replace("'", "")
try :
print(where, lat, lng)
count = count + 1
if count > 1 : fhand.write(",\n")
output = "["+str(lat)+","+str(lng)+", '"+where+"']"
fhand.write(output)
except:
continue
fhand.write("\n];\n")
cur.close()
fhand.close()
print(count, "records written to where.js")
print("Open where.html to view the data in a browser")