Incrementally update geocoded data
Starting point: we have a sqlite3 database with (dirty) city names and country codes. We would like to have the lat/lon coordinates for each place.
Here is some sample data, scraped from the Brussels marathon results webpage, indicating each athlete's town of residence and nationality:
LA CELLE SAINT CLOUD, FRA
TERTRE, BEL
FREDERICIA, DNK
But sometimes the town and country don't match up, e.g.:
HEVERLEE CHN
WOLUWE-SAINT-PIERRE JPN
BUIZINGEN FRA
For the geocoding we make use of nominatim.openstreetmap.org. The 3-letter country code also needs to be translated into a country name, for which we use the file /u01/data/20150215_country_code_iso/country_codes.txt.
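The script below assumes this mapping file is pipe-separated, with a 'country' column and a 'c3' column holding the 3-letter code. A sketch of what the relevant lines might look like (the exact layout of the file is an assumption):
country|c3
Belgium|BEL
Denmark|DNK
France|FRA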
For the places for which we don't get valid (lat,lon) coordinates, we store (0,0).
We run this script multiple times: small batches in the beginning, to be able to see which exceptions occur, and bigger batches at the end (when the problems have been solved).
In between runs the data may be manually modified by opening the sqlite3 database and updating/deleting rows in the t_geocode table.
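For example, a typical manual fix between runs might look like this (a sketch; the coordinate values are illustrative):
$ sqlite3 marathon.sqlite
sqlite> -- drop a bad entry so the next run retries it:
sqlite> delete from t_geocode where gemeente='HEVERLEE' and ioc='CHN';
sqlite> -- or correct the coordinates by hand:
sqlite> update t_geocode set lat=50.86, lon=4.70 where gemeente='BUIZINGEN' and ioc='FRA';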
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import sqlite3
import json
import pandas as pd

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

# do we need to create the t_geocode table?
cur.execute("SELECT count(1) FROM sqlite_master WHERE type='table' AND name='t_geocode'")
n = cur.fetchone()[0]
if n == 0:
    cur.execute('''
        CREATE TABLE t_geocode(
            gemeente varchar(128),
            ioc varchar(128),
            lat double,
            lon double
        )''')

# read the file mapping 3-letter country codes to country names into a dataframe
df = pd.io.parsers.read_table('/u01/data/20150215_country_code_iso/country_codes.txt', sep='|')

# turn the dataframe into a dictionary
c3d = df.set_index('c3')['country'].to_dict()

# select the records that have no geocode entry yet
cur.execute('''SELECT DISTINCT r.gemeente, r.ioc, g.gemeente, g.ioc
               FROM t_result r
               LEFT OUTER JOIN t_geocode g
                 ON (r.gemeente = g.gemeente AND r.ioc = g.ioc)
               WHERE g.gemeente IS NULL''')

n = 50  # batch size: change this to your liking; small in the beginning (look at exceptional cases etc.)
for row in cur.fetchall():
    (g, c) = row[:2]  # ignore the latter 2 fields
    print "---", g, c, "----------------------------------"
    if g == 'X' or len(g) == 0 or g == 'Gemeente':
        print "skip"
        continue
    cy = c3d[c]  # a KeyError here means the country code needs a manual fix
    print "{} -> {}".format(c, cy)
    url = u'http://nominatim.openstreetmap.org/search?country={}&city={}&format=json'.format(cy, g)
    r = requests.get(url)
    jr = json.loads(r.text)
    (lat, lon) = (0.0, 0.0)  # default for places that don't geocode
    if len(jr) > 0:
        lat = jr[0]['lat']
        lon = jr[0]['lon']
    print "{} {} {} -> ({},{})".format(g, c, cy, lat, lon)
    cur.execute('''INSERT INTO t_geocode(gemeente, ioc, lat, lon)
                   VALUES (?, ?, ?, ?)''', (g, c, lat, lon))
    # batch: stop after n places
    n -= 1
    if n == 0:
        break

conn.commit()
cur.close()
A few queries
Total number of places:
select count(1) from t_geocode
916
Number of places for which we didn't find valid coordinates:
select count(1) from t_geocode where lat=0 and lon=0
185
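To list those places (handy when deciding what to fix manually between runs):
select gemeente, ioc from t_geocode where lat=0 and lon=0 order by gemeente;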
Create a DB by scraping a webpage
Download all the webpages and put them in a zipfile (to avoid 're-downloading' on each try).
If you want to work 'directly', then use this to read the HTML content of a URL:
html_doc=urllib.urlopen(url).read()
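For the download-and-zip step itself, a minimal sketch (Python 2, matching the rest of the code; the result-page URL pattern and page count are placeholders, not the real ones):
import urllib
import zipfile

zf = zipfile.ZipFile('brx_marathon_html.zip', 'w')
for page in range(1, 40):  # hypothetical number of result pages
    url = 'http://example.org/results?page={}'.format(page)  # placeholder URL
    html_doc = urllib.urlopen(url).read()
    zf.writestr('page_{:03d}.html'.format(page), html_doc)
zf.close()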
Preparation: create database table
cur.execute('DROP TABLE IF EXISTS t_result')
cur.execute('''
CREATE TABLE t_result(
    pos varchar(128),
    nr varchar(128),
    gesl varchar(128),
    naam varchar(128),
    leeftijd varchar(128),
    ioc varchar(128),
    tijd varchar(128),
    tkm varchar(128),
    gem varchar(128),
    cat_plaats varchar(128),
    cat_naam varchar(128),
    gemeente varchar(128)
)
''')
Pull each html file from the zipfile
zf = zipfile.ZipFile('brx_marathon_html.zip', 'r')
for fn in zf.namelist():
    try:
        content = zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break
Parse the content of each html file with Beautiful Soup
soup = BeautifulSoup(content)
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
rows = table.findAll('tr')
for row in rows:
    cols = row.findAll('td')
    e = [ele.text.strip() for ele in cols]
    if len(e) > 10:
        cur.execute('INSERT INTO t_result VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11]))
Note: the above code is Beautiful Soup 3; for Beautiful Soup 4, findAll needs to be replaced by find_all.
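For reference, the Beautiful Soup 4 equivalent of the parsing step might look like this (a sketch; same table attributes, only the import and method names change):
from bs4 import BeautifulSoup

soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
for row in table.find_all('tr'):
    e = [td.text.strip() for td in row.find_all('td')]
    # ... insert into t_result as above ...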
Complete source code
#!/usr/bin/python
from BeautifulSoup import *
import sqlite3
import zipfile

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

def handle_content(content):
    soup = BeautifulSoup(content)
    table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
    rows = table.findAll('tr')  # note: bs3 findAll = find_all in bs4!
    for row in rows:
        cols = row.findAll('td')
        e = [ele.text.strip() for ele in cols]
        if len(e) > 10:
            print u"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11])
            cur.execute('INSERT INTO t_result VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                        (e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11]))

# preparation: recreate the result table
cur.execute('DROP TABLE IF EXISTS t_result')
cur.execute('''
CREATE TABLE t_result(
    pos varchar(128),
    nr varchar(128),
    gesl varchar(128),
    naam varchar(128),
    leeftijd varchar(128),
    ioc varchar(128),
    tijd varchar(128),
    tkm varchar(128),
    gem varchar(128),
    cat_plaats varchar(128),
    cat_naam varchar(128),
    gemeente varchar(128)
)
''')

# MAIN LOOP: read the zipfile, and handle each file in it
zf = zipfile.ZipFile('brx_marathon_html.zip', 'r')
for fn in zf.namelist():
    try:
        content = zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break

conn.commit()
cur.close()