The Python Book
 
format string
20190731

Python string format

The cheat sheet at mkaz.blog/code/python-string-format-cookbook is a good reference; go there, there are many more examples.
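
A few representative patterns (a small hand-picked selection using standard format-spec syntax, not an excerpt from the cookbook):

x = 3.14159

print('{:.2f}'.format(x))          # '3.14'       two decimal places
print('{:>10}'.format('python'))   # '    python' right-aligned, width 10
print('{:0>4d}'.format(42))        # '0042'       zero-padded to width 4
print('{:,}'.format(1234567))      # '1,234,567'  thousands separator
print('{:.2%}'.format(0.25))       # '25.00%'     percentage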

pandas plot garmin
20190714

Convert a Garmin .FIT file and plot the heartrate of your run

Use gpsbabel to turn your FIT file into CSV:

gpsbabel -t -i garmin_fit -f 97D54119.FIT -o unicsv -F 97D54119.csv

Pandas imports:

import pandas as pd
import math
import matplotlib.pyplot as plt

pd.options.display.width=150

Read the CSV file:

df=pd.read_csv('97D54119.csv',sep=',',skip_blank_lines=False)

Show some points:

df.head(500).tail(10)


      No   Latitude  Longitude  Altitude  Speed  Heartrate  Cadence        Date      Time
490  491  50.855181   4.826737      78.2   2.79        144     78.0  2019/07/13  06:44:22
491  492  50.855136   4.826739      77.6   2.79        147     78.0  2019/07/13  06:44:24
492  493  50.854962   4.826829      76.2   2.77        148     77.0  2019/07/13  06:44:32
493  494  50.854778   4.826951      77.4   2.77        146     78.0  2019/07/13  06:44:41
494  495  50.854631   4.827062      78.0   2.71        143     78.0  2019/07/13  06:44:49
495  496  50.854531   4.827174      79.2   2.70        146     77.0  2019/07/13  06:44:54
496  497  50.854472   4.827249      79.2   2.73        149     77.0  2019/07/13  06:44:57
497  498  50.854315   4.827418      79.8   2.74        149     76.0  2019/07/13  06:45:05
498  499  50.854146   4.827516      77.4   2.67        147     76.0  2019/07/13  06:45:14
499  500  50.853985   4.827430      79.0   2.59        144     75.0  2019/07/13  06:45:22

Function to approximately compute the distance (in km) between two points, using the equirectangular approximation:

#  function to approximately calculate the distance between 2 points
#  from: http://www.movable-type.co.uk/scripts/latlong.html
def rough_distance(lat1, lon1, lat2, lon2):
    lat1 = lat1 * math.pi / 180.0
    lon1 = lon1 * math.pi / 180.0
    lat2 = lat2 * math.pi / 180.0
    lon2 = lon2 * math.pi / 180.0
    r = 6371.0 # earth radius in km
    x = (lon2 - lon1) * math.cos((lat1+lat2)/2)
    y = (lat2 - lat1)
    d = math.sqrt(x*x+y*y) * r
    return d
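
A quick sanity check, using the first and last points of the table above (expect roughly 0.14 km):

rough_distance(50.855181, 4.826737, 50.853985, 4.827430)
# -> ~0.14 (km)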

Compute the cumulative distance:

ds=[]
(d,priorlat,priorlon)=(0.0, 0.0, 0.0)
for t in df[['Latitude','Longitude']].itertuples():
    if len(ds)>0:                     # the first point has no predecessor
        d+=rough_distance(t.Latitude,t.Longitude, priorlat, priorlon)
    ds.append(d)
    (priorlat,priorlon)=(t.Latitude,t.Longitude)

df['CumulativeDist']=ds
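
For longer tracks, the row-by-row loop can be replaced by a vectorized sketch (same equirectangular formula; assumes numpy is imported as np):

import numpy as np

lat = np.radians(df['Latitude'])
lon = np.radians(df['Longitude'])
# distance from each point to its predecessor (first row has none -> NaN)
x = (lon - lon.shift()) * np.cos((lat + lat.shift()) / 2)
y = lat - lat.shift()
step = np.sqrt(x**2 + y**2) * 6371.0
df['CumulativeDist'] = step.fillna(0.0).cumsum()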

Let's plot!

df.plot(kind='line',x='CumulativeDist',y='Heartrate',color='red')
plt.show() 

Or multiple columns:

plt.plot( df.CumulativeDist, df.Heartrate, color='red')
plt.plot( df.CumulativeDist, df.Altitude, color='blue')
plt.show()
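
Heartrate and altitude have quite different scales, so a shared y-axis can squash one of the curves; a twin-axis variant (standard matplotlib, sketched here) keeps both readable:

fig, ax1 = plt.subplots()
ax1.plot(df.CumulativeDist, df.Heartrate, color='red')
ax1.set_ylabel('Heartrate', color='red')
ax2 = ax1.twinx()                  # second y-axis sharing the same x-axis
ax2.plot(df.CumulativeDist, df.Altitude, color='blue')
ax2.set_ylabel('Altitude', color='blue')
plt.show()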
dataframe pandas
20190202

Turn a dataframe into an array

e.g. dataframe cn:

cn

                     asciiname  population  elevation
128677                Crossett        5507         58
7990    Santa Maria da Vitoria       23488        438
25484                 Shanling           0        628
95882     Colonia Santa Teresa       36845       2286
38943                 Blomberg        1498          4
7409              Missao Velha       13106        364
36937                  Goerzig        1295         81

Turn it into an array (note: the output below comes from a different sample run):

cn.iloc[range(len(cn))].values


array([['Yuhu', 0, 15],
       ['Traventhal', 551, 42],
       ['Velabisht', 0, 60],
       ['Almorox', 2319, 539],
       ['Abuyog', 15632, 6],
       ['Zhangshan', 0, 132],
       ['Llica de Vall', 0, 136],
       ['Capellania', 2252, 31],
       ['Mezocsat', 6519, 91],
       ['Vars', 1634, 52]], dtype=object)

Sidenote: cn was pulled from city data: cn=df.sample(7)[['asciiname','population','elevation']].
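
Sidenote 2: the iloc/range detour isn't needed; these one-liners give the same array (to_numpy() assumes pandas >= 0.24):

cn.values
cn.to_numpy()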

dataframe dropnull
20190202

Filter a dataframe to retain rows with non-null values

E.g. you want only the rows where the 'population' column has non-null values.

In short

df=df[df['population'].notnull()]

Alternatively, replace the null values with something:

df['population']=df['population'].fillna(0) 

In detail

import numpy as np
import pandas as pd

# setup the dataframe
data=[[ 'Isle of Skye',         9232, 124 ],
      [ 'Vieux-Charmont',     np.nan, 320 ],
      [ 'Indian Head',          3844,  35 ],
      [ 'Cihua',              np.nan, 178 ],
      [ 'Miasteczko Slaskie',   7327, 301 ],
      [ 'Wawa',               np.nan,   7 ],
      [ 'Bat Khela',           46079, 673 ]]

df=pd.DataFrame(data, columns=['asciiname','population','elevation'])
 
#display the dataframe
df

            asciiname  population  elevation
0        Isle of Skye      9232.0        124
1      Vieux-Charmont         NaN        320
2         Indian Head      3844.0         35
3               Cihua         NaN        178
4  Miasteczko Slaskie      7327.0        301
5                Wawa         NaN          7
6           Bat Khela     46079.0        673


# retain only the rows where population has a non-null value
df=df[df['population'].notnull()]

            asciiname  population  elevation
0        Isle of Skye      9232.0        124
2         Indian Head      3844.0         35
4  Miasteczko Slaskie      7327.0        301
6           Bat Khela     46079.0        673
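
pandas also has a dedicated method for this; a one-line equivalent of the notnull() filter above:

df = df.dropna(subset=['population'])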
exif
20161015

List EXIF details of a photo

This program walks a directory and lists selected EXIF data.

Install module exifread first.
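
pip install exifread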

#!/usr/bin/python 

import exifread
import os

def handle_file(fn):
    jp=open(fn,'rb')
    tags=exifread.process_file(jp)
    #for k in tags.keys(): 
    #    if k!='JPEGThumbnail':
    #        print k,"->",tags[k]

    dt=tags.get('EXIF DateTimeOriginal','UNK')
    iw=tags.get('EXIF ExifImageWidth',  'UNK')
    ih=tags.get('EXIF ExifImageLength', 'UNK')
    im=tags.get('Image Model', 'UNK')
    fs=os.path.getsize(fn)
    print "{}@fs={}|dt={}|iw={}|ih={}|im={}".format(fn,fs,dt,iw,ih,im)


startdir='/home/willem/sync/note/bootstrap/ex09_carousel'
#startdir='/media/willem/EOS_DIGITAL/DCIM'
for dirname, subdirlist, filelist in os.walk(startdir):
    for fn in filelist:
        if fn.endswith('.jpg') or fn.endswith('.JPG'):
            handle_file(dirname+'/'+fn)
beautifulsoup
20161013

The topics of KubeCon (the Kubernetes conference)

Input

Markdown doc 'source.md' with all the presentation titles plus links:

[2000 Nodes and Beyond: How We Scaled Kubernetes to 60,000-Container Clusters
and Where We're Going Next - Marek Grabowski, Google Willow
A](/event/8K8w/2000-nodes-and-beyond-how-we-scaled-kubernetes-to-60000
-container-clusters-and-where-were-going-next-marek-grabowski-google) [How Box
Runs Containers in Production with Kubernetes - Sam Ghods, Box Grand Ballroom
D](/event/8K8u/how-box-runs-containers-in-production-with-kubernetes-sam-
ghods-box) [ITNW (If This Now What) - Orchestrating an Enterprise - Michael
Ward, Pearson Grand Ballroom C](/event/8K8t/itnw-if-this-now-what-
orchestrating-an-enterprise-michael-ward-pearson) [Unik: Unikernel Runtime for
Kubernetes - Idit Levine, EMC Redwood AB](/event/8K8v/unik-unikernel-runtime-
..
..

Step 1: generate download script

Grab the links from 'source.md' and generate a wget command for each.

#!/usr/bin/python 
# -*- coding: utf-8 -*-

import re

# read the whole markdown file into one long string
buf=""
infile = file('source.md', 'r')
for line in infile.readlines():
    buf+=line.rstrip('\n')

# repeatedly match the first '(...)' link target, print a wget command
# for it, then continue scanning the rest of the buffer
oo=1
while True:
    match = re.search( '^(.*?\()(/.[^\)]*)(\).*$)', buf)
    if match is None:
        break
    url="https://cnkc16.sched.org"+match.group(2)
    print "wget '{}' -O {:0>4d}.html".format(url,oo)
    oo+=1
    buf=match.group(3)

Step 2: download the html

Execute the script generated by the above code, and put the resulting files in the directory 'content':

wget 'https://cnkc16.sched.org/event/8K8w/2000-nodes-and-beyond-how-
      we-scaled-kubernetes-to-60000-container-clusters-and-where-were-
      going-next-marek-grabowski-google' -O 0001.html
wget 'https://cnkc16.sched.org/event/8K8u/how-box-runs-containers-in-
      production-with-kubernetes-sam-ghods-box' -O 0002.html
wget 'https://cnkc16.sched.org/event/8K8t/itnw-if-this-now-what-
      orchestrating-an-enterprise-michael-ward-pearson' -O 0003.html
.. 

Step 3: parse with beautiful soup

#!/usr/bin/python 
# -*- coding: utf-8 -*-

from BeautifulSoup import *
import os
import re
import codecs

#outfile = file('text.md', 'w')
# ^^^ --> UnicodeEncodeError: 
#                'ascii' codec can't encode character u'\u2019' 
#                in position 73: ordinal not in range(128)
outfile= codecs.open("text.md", "w", "utf-8")

file_ls=[]
for filename in os.listdir("content"):
    if filename.endswith(".html"):
        file_ls.append(filename)

for filename in sorted(file_ls):
    infile = file('content/'+filename,'r')
    content = infile.read()
    infile.close()
    soup = BeautifulSoup(content.decode('utf-8','ignore'))

    div= soup.find('div', attrs={'class':'sched-container-inner'})
    el_ls= div.findAll('span')

    el=el_ls[0].text.strip()
    title=re.sub(' - .*$','',el)
    speaker=re.sub('^.* - ','',el)

    outfile.write( u'\n\n## {}\n'.format(title))
    outfile.write( u'\n\n{}\n'.format(speaker) )

    det= div.find('div', attrs={'class':'tip-description'})
    if det is not None:
        outfile.write( u'\n{}\n'.format(det.text.strip() ) )
geocode sqlite
20161004

Incrementally update geocoded data

Starting point: we have a sqlite3 database with (dirty) city names and country codes. We would like to have the lat/lon coordinates for each place.

Here is some sample data, scraped from the Brussels marathon results webpage, indicating the town of residence and the nationality of the athlete:

LA CELLE SAINT CLOUD, FRA 
TERTRE, BEL 
FREDERICIA, DNK 

But sometimes the town and country don't match up, e.g.:

HEVERLEE CHN 
WOLUWE-SAINT-PIERRE JPN 
BUIZINGEN FRA 

For the geocoding we make use of nominatim.openstreetmap.org. The 3-letter country code also needs to be translated into a country name, for which we use the file /u01/data/20150215_country_code_iso/country_codes.txt.

For places where we don't get valid (lat,lon) coordinates, we store (0,0).

We run this script multiple times: small batches in the beginning, to see which exceptions occur, and bigger batches at the end (when the problems have been solved).

In between runs, the data may be manually modified by opening the sqlite3 database and updating/deleting the t_geocode table.

#!/usr/bin/python 
# -*- coding: utf-8 -*-

import requests
import sqlite3
import json
import pandas as pd 

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

# do we need to create the t_geocode table?
cur.execute("SELECT count(1) from sqlite_master WHERE type='table' AND name='t_geocode' ") 
n=cur.fetchone()[0]
if n==0:
    cur.execute('''
            CREATE TABLE t_geocode(
                gemeente varchar(128),
                ioc varchar(128),
                lat double,
                lon double
                )''') 

# read the file mapping 3-letter country codes to country names into a dataframe
df=pd.io.parsers.read_table('/u01/data/20150215_country_code_iso/country_codes.txt',sep='|')

# turn the dataframe into a dictionary
c3d=df.set_index('c3')['country'].to_dict()

# select the records to geocode 
cur.execute(''' SELECT distinct r.gemeente,r.ioc,g.gemeente, g.ioc
                FROM   t_result r 
                       left outer join t_geocode g
                                   on (r.gemeente=g.gemeente and r.ioc=g.ioc)
                WHERE g.gemeente is null ''') 

n=50 # batch size: change this to your liking. Small in the beginning (look at exceptional cases etc..) 
for row in cur.fetchall(): 
    (g,c)=row[:2] # ignore the latter 2 fields
    print "---", g,c, "----------------------------------"
    if g=='X' or len(g)==0 or g=='Gemeente':
        print "skip"
        continue

    cy=c3d[c]
    print "{} -> {}".format(c,cy) 
    
    url=u'http://nominatim.openstreetmap.org/search?country={}&city={}&format=json'.format( cy,g ) 
    r = requests.get(url)
    jr=json.loads(r.text)
    (lat,lon)=(0.0,0.0)
    if len(jr)>0:
        lat=jr[0]['lat']
        lon=jr[0]['lon']
    print "{} {} {} -> ({},{})".format(g,c,cy,lat,lon)
    cur.execute('''insert into t_geocode( gemeente , ioc , lat, lon)
                       values(?,?,?,?)''', ( g,c,lat,lon) ) 
    # batch
    n-=1
    if n==0:
        break

cur.close()
conn.commit()
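
Note: the public nominatim.openstreetmap.org instance expects clients to throttle themselves (the usage policy asks for roughly one request per second at most), so for bigger batches it is wise to pause inside the loop:

import time

# after each requests.get(url) inside the loop above:
time.sleep(1)   # stay within Nominatim's ~1 request/second guideline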

A few queries

Total number of places:

select count(1) from t_geocode
916

Number of places for which we didn't find valid coordinates:

select count(1) from t_geocode where lat=0 and lon=0
185
pandas dataframe
20161004

Turn a pandas dataframe into a dictionary

e.g. create a mapping of a 3-letter code to a country name

BEL -> Belgium
CHN -> China
FRA -> France
..

Code:

df=pd.io.parsers.read_table(
    '/u01/data/20150215_country_code_iso/country_codes.txt',
    sep='|')

c3d=df.set_index('c3')['country'].to_dict()

Result:

c3d['AUS']
'Australia'

c3d['GBR']
'United Kingdom'
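
An equivalent one-liner, without going through the index:

c3d = dict(zip(df['c3'], df['country']))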
beautifulsoup sqlite
20161003

Create a DB by scraping a webpage

Download all the webpages and put them in a zipfile (to avoid 're-downloading' on each try).

If you want to work 'directly', use this to read the HTML content of a URL:

html_doc=urllib.urlopen(url).read()

Preparation: create database table

cur.execute('DROP TABLE IF EXISTS t_result')

cur.execute('''
CREATE TABLE t_result(
        pos  varchar(128),
        nr  varchar(128),
        gesl varchar(128),
        naam varchar(128),
        leeftijd varchar(128),
        ioc varchar(128),
        tijd varchar(128),
        tkm varchar(128),
        gem varchar(128),
        cat_plaats varchar(128),
        cat_naam varchar(128),
        gemeente varchar(128)
        ) 
''') ## 

Pull each html file from the zipfile

zf=zipfile.ZipFile('brx_marathon_html.zip','r')
for fn in zf.namelist():
    try:
        content= zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break

Parse the content of each html file with Beautiful Soup

soup = BeautifulSoup(content)

table= soup.find('table', attrs={'cellspacing':'0', 'cellpadding':'2'})
rows = table.findAll('tr')          
for row in rows:
    cols = row.findAll('td')
    e = [ ele.text.strip()  for ele in cols]
    if len(e)>10:
        cur.execute('INSERT INTO T_RESULT VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,? )',
                    (e[0],e[1],e[2],e[3],e[4],e[5],e[6],e[7],e[8],e[9],e[10],e[11]) )

Note: the above code is beautiful soup 3, for beautiful soup 4, the findAll needs to be replaced by find_all.
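
For reference, the equivalent bs4 parsing might look like this (a sketch, assuming the beautifulsoup4 package is installed):

from bs4 import BeautifulSoup

soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
for row in table.find_all('tr'):
    cols = [td.text.strip() for td in row.find_all('td')]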

Complete source code

#!/usr/bin/python 

from BeautifulSoup import *
import sqlite3
import zipfile

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

def handle_content(content): 
    soup = BeautifulSoup(content)

    table= soup.find('table', attrs={'cellspacing':'0', 'cellpadding':'2'}) 
    rows = table.findAll('tr')          # Note: bs3 findAll = find_all in bs4 !
    for row in rows:
        cols = row.findAll('td')
        e = [ ele.text.strip()  for ele in cols]
        if len(e)>10: 
            print u"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        e[0],e[1],e[2],e[3],e[4],e[5],e[6],e[7],e[8],e[9],e[10],e[11])  
            cur.execute('INSERT INTO T_RESULT VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,? )', 
                        (e[0],e[1],e[2],e[3],e[4],e[5],e[6],e[7],e[8],e[9],e[10],e[11]) ) 


cur.execute('DROP TABLE IF EXISTS t_result')

cur.execute('''
CREATE TABLE t_result(
        pos  varchar(128),
        nr  varchar(128),
        gesl varchar(128),
        naam varchar(128),
        leeftijd varchar(128),
        ioc varchar(128),
        tijd varchar(128),
        tkm varchar(128),
        gem varchar(128),
        cat_plaats varchar(128),
        cat_naam varchar(128),
        gemeente varchar(128)
        ) 
''') ## 



# MAIN LOOP 
# read zipfile, and handle each file
zf=zipfile.ZipFile('brx_marathon_html.zip','r')
for fn in zf.namelist():
    try:
        content= zf.read(fn)
        handle_content(content) 
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break


cur.close()
conn.commit()
datetime deltatime
20160930

Days between dates

Q: how many days are there between these dates?

'29 sep 2016', '7 jul 2016', '28 apr 2016', '10 mar 2016', '14 jan 2016'

Solution:

from datetime import datetime,timedelta

# parse the date strings into datetime objects
a=map(lambda x: datetime.strptime(x,'%d %b %Y'),
      ['29 sep 2016', '7 jul 2016', '28 apr 2016', '10 mar 2016', '14 jan 2016'] )

# recursively walk the list, printing each consecutive pair of dates
# and the number of days between them
def dr(ar):
    if len(ar)>1:
        print "{:%d %b %Y} .. {} .. {:%d %b %Y} ".format(
                            ar[0], (ar[0]-ar[1]).days, ar[1])
        dr(ar[1:])

Output:

dr(a) 

29 Sep 2016 .. 84 .. 07 Jul 2016 
07 Jul 2016 .. 70 .. 28 Apr 2016 
28 Apr 2016 .. 49 .. 10 Mar 2016 
10 Mar 2016 .. 56 .. 14 Jan 2016 
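
The same pairwise walk can be written without recursion (a sketch; relies on a being the list built above):

for d1, d0 in zip(a, a[1:]):
    print("{:%d %b %Y} .. {} .. {:%d %b %Y}".format(d1, (d1 - d0).days, d0))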
 
Notes by Willem Moors. Generated on momo:/home/willem/sync/20151223_datamungingninja/pythonbook at 2019-07-31 19:22