Incrementally update geocoded data
Starting point: we have a sqlite3 database with (dirty) city names and country codes. We would like to have the lat/lon coordinates for each place.
Here is some sample data, scraped from the Brussels marathon results webpage, indicating each athlete's town of residence and nationality:
LA CELLE SAINT CLOUD, FRA
TERTRE, BEL
FREDERICIA, DNK
But sometimes the town and country don't match up, e.g.:
HEVERLEE CHN
WOLUWE-SAINT-PIERRE JPN
BUIZINGEN FRA
For the geocoding we make use of nominatim.openstreetmap.org. The 3-letter country code also needs to be translated into a country name, for which we use the file /u01/data/20150215_country_code_iso/country_codes.txt.
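The script below assumes this mapping file is pipe-separated, with a 'country' column and a 'c3' column holding the 3-letter code. A sketch of what the relevant lines might look like (the exact layout of the file is an assumption):
country|c3
Belgium|BEL
Denmark|DNK
France|FRA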
For the places for which we don't get valid (lat,lon) coordinates, we store (0,0).
We run this script multiple times: small batches in the beginning, to be able to see which exceptions occur, and bigger batches at the end (when the problems have been solved).
In between runs the data may be manually modified by opening the sqlite3 database and updating/deleting rows in the t_geocode table.
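For example, a typical manual fix between runs might look like this (a sketch; the coordinate values are illustrative):
$ sqlite3 marathon.sqlite
sqlite> -- drop a bad entry so the next run retries it:
sqlite> delete from t_geocode where gemeente='HEVERLEE' and ioc='CHN';
sqlite> -- or correct the coordinates by hand:
sqlite> update t_geocode set lat=50.86, lon=4.70 where gemeente='BUIZINGEN' and ioc='FRA';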
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import sqlite3
import json
import pandas as pd

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

# do we need to create the t_geocode table?
cur.execute("SELECT count(1) FROM sqlite_master WHERE type='table' AND name='t_geocode'")
n = cur.fetchone()[0]
if n == 0:
    cur.execute('''
        CREATE TABLE t_geocode(
            gemeente varchar(128),
            ioc varchar(128),
            lat double,
            lon double
        )''')

# read the file mapping 3-letter country codes to country names into a dataframe
df = pd.io.parsers.read_table('/u01/data/20150215_country_code_iso/country_codes.txt', sep='|')

# turn the dataframe into a dictionary
c3d = df.set_index('c3')['country'].to_dict()

# select the records that have no geocode entry yet
cur.execute('''SELECT DISTINCT r.gemeente, r.ioc, g.gemeente, g.ioc
               FROM t_result r
               LEFT OUTER JOIN t_geocode g
                 ON (r.gemeente = g.gemeente AND r.ioc = g.ioc)
               WHERE g.gemeente IS NULL''')

n = 50  # batch size: change this to your liking; small in the beginning (look at exceptional cases etc.)
for row in cur.fetchall():
    (g, c) = row[:2]  # ignore the latter 2 fields
    print "---", g, c, "----------------------------------"
    if g == 'X' or len(g) == 0 or g == 'Gemeente':
        print "skip"
        continue
    cy = c3d[c]  # a KeyError here means the country code needs a manual fix
    print "{} -> {}".format(c, cy)
    url = u'http://nominatim.openstreetmap.org/search?country={}&city={}&format=json'.format(cy, g)
    r = requests.get(url)
    jr = json.loads(r.text)
    (lat, lon) = (0.0, 0.0)  # default for places that don't geocode
    if len(jr) > 0:
        lat = jr[0]['lat']
        lon = jr[0]['lon']
    print "{} {} {} -> ({},{})".format(g, c, cy, lat, lon)
    cur.execute('''INSERT INTO t_geocode(gemeente, ioc, lat, lon)
                   VALUES (?, ?, ?, ?)''', (g, c, lat, lon))
    # batch: stop after n places
    n -= 1
    if n == 0:
        break

conn.commit()
cur.close()
A few queries
Total number of places:
select count(1) from t_geocode
916
Number of places for which we didn't find valid coordinates:
select count(1) from t_geocode where lat=0 and lon=0
185
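To list those places (handy when deciding what to fix manually between runs):
select gemeente, ioc from t_geocode where lat=0 and lon=0 order by gemeente;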
Create a DB by scraping a webpage
Download all the webpages and put them in a zipfile (to avoid 're-downloading' on each try).
If you want to work 'directly', then use this to read the HTML content of a URL:
html_doc=urllib.urlopen(url).read()
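For the download-and-zip step itself, a minimal sketch (Python 2, matching the rest of the code; the result-page URL pattern and page count are placeholders, not the real ones):
import urllib
import zipfile

zf = zipfile.ZipFile('brx_marathon_html.zip', 'w')
for page in range(1, 40):  # hypothetical number of result pages
    url = 'http://example.org/results?page={}'.format(page)  # placeholder URL
    html_doc = urllib.urlopen(url).read()
    zf.writestr('page_{:03d}.html'.format(page), html_doc)
zf.close()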
Preparation: create database table
cur.execute('DROP TABLE IF EXISTS t_result')
cur.execute('''
CREATE TABLE t_result(
    pos varchar(128),
    nr varchar(128),
    gesl varchar(128),
    naam varchar(128),
    leeftijd varchar(128),
    ioc varchar(128),
    tijd varchar(128),
    tkm varchar(128),
    gem varchar(128),
    cat_plaats varchar(128),
    cat_naam varchar(128),
    gemeente varchar(128)
)
''')
Pull each html file from the zipfile
zf = zipfile.ZipFile('brx_marathon_html.zip', 'r')
for fn in zf.namelist():
    try:
        content = zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break
Parse the content of each html file with Beautiful Soup
soup = BeautifulSoup(content)
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
rows = table.findAll('tr')
for row in rows:
    cols = row.findAll('td')
    e = [ele.text.strip() for ele in cols]
    if len(e) > 10:
        cur.execute('INSERT INTO t_result VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11]))
Note: the above code is Beautiful Soup 3; for Beautiful Soup 4, findAll needs to be replaced by find_all.
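For reference, the Beautiful Soup 4 equivalent of the parsing step might look like this (a sketch; same table attributes, only the import and method names change):
from bs4 import BeautifulSoup

soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
for row in table.find_all('tr'):
    e = [td.text.strip() for td in row.find_all('td')]
    # ... insert into t_result as above ...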
Complete source code
#!/usr/bin/python
from BeautifulSoup import *
import sqlite3
import zipfile

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

def handle_content(content):
    soup = BeautifulSoup(content)
    table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
    rows = table.findAll('tr')  # note: bs3 findAll = find_all in bs4!
    for row in rows:
        cols = row.findAll('td')
        e = [ele.text.strip() for ele in cols]
        if len(e) > 10:
            print u"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11])
            cur.execute('INSERT INTO t_result VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                        (e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11]))

# preparation: recreate the result table
cur.execute('DROP TABLE IF EXISTS t_result')
cur.execute('''
CREATE TABLE t_result(
    pos varchar(128),
    nr varchar(128),
    gesl varchar(128),
    naam varchar(128),
    leeftijd varchar(128),
    ioc varchar(128),
    tijd varchar(128),
    tkm varchar(128),
    gem varchar(128),
    cat_plaats varchar(128),
    cat_naam varchar(128),
    gemeente varchar(128)
)
''')

# MAIN LOOP: read the zipfile, and handle each file in it
zf = zipfile.ZipFile('brx_marathon_html.zip', 'r')
for fn in zf.namelist():
    try:
        content = zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break

conn.commit()
cur.close()