The Python Book
 
beautifulsoup
20161013

The topics of KubeCon (the Kubernetes conference)

Input

Markdown doc 'source.md' with all the presentation titles plus links:

[2000 Nodes and Beyond: How We Scaled Kubernetes to 60,000-Container Clusters
and Where We're Going Next - Marek Grabowski, Google Willow
A](/event/8K8w/2000-nodes-and-beyond-how-we-scaled-kubernetes-to-60000
-container-clusters-and-where-were-going-next-marek-grabowski-google) [How Box
Runs Containers in Production with Kubernetes - Sam Ghods, Box Grand Ballroom
D](/event/8K8u/how-box-runs-containers-in-production-with-kubernetes-sam-
ghods-box) [ITNW (If This Now What) - Orchestrating an Enterprise - Michael
Ward, Pearson Grand Ballroom C](/event/8K8t/itnw-if-this-now-what-
orchestrating-an-enterprise-michael-ward-pearson) [Unik: Unikernel Runtime for
Kubernetes - Idit Levine, EMC Redwood AB](/event/8K8v/unik-unikernel-runtime-
..
..

Step 1: generate download script

Grab the links from 'source.md' and generate a wget command for each of them:

#!/usr/bin/python 
# -*- coding: utf-8 -*-

import re

# Read 'source.md' into a single string, dropping the newlines,
# so that links wrapped across lines are joined back together.
buf = ""
infile = open('source.md', 'r')
for line in infile.readlines():
    buf += line.rstrip('\n')
infile.close()

# Repeatedly match the first markdown link target '(...)' in the buffer:
# group(2) is the relative URL, group(3) is the remainder of the buffer,
# which becomes the input for the next iteration.
oo = 1   # sequence number for the output filenames
while True:
    match = re.search(r'^(.*?\()(/[^)]*)(\).*$)', buf)
    if match is None:
        break
    url = "https://cnkc16.sched.org" + match.group(2)
    print "wget '{}' -O {:0>4d}.html".format(url, oo)
    oo += 1
    buf = match.group(3)
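Run the generator and redirect its output into a shell script; the name gen_download.py is just an assumed filename for this example:

python gen_download.py > download.sh
sh download.sh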

Step 2: download the HTML

Execute the script generated by the code above, and put the resulting files in the directory 'content':

wget 'https://cnkc16.sched.org/event/8K8w/2000-nodes-and-beyond-how-
      we-scaled-kubernetes-to-60000-container-clusters-and-where-were-
      going-next-marek-grabowski-google' -O 0001.html
wget 'https://cnkc16.sched.org/event/8K8u/how-box-runs-containers-in-
      production-with-kubernetes-sam-ghods-box' -O 0002.html
wget 'https://cnkc16.sched.org/event/8K8t/itnw-if-this-now-what-
      orchestrating-an-enterprise-michael-ward-pearson' -O 0003.html
.. 

Step 3: parse with Beautiful Soup

#!/usr/bin/python 
# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
import os
import re
import codecs

#outfile = file('text.md', 'w')
# ^^^ --> UnicodeEncodeError: 
#                'ascii' codec can't encode character u'\u2019' 
#                in position 73: ordinal not in range(128)
outfile = codecs.open("text.md", "w", "utf-8")

# Collect all downloaded html files from the 'content' directory
file_ls = []
for filename in os.listdir("content"):
    if filename.endswith(".html"):
        file_ls.append(filename)

for filename in sorted(file_ls):
    infile = open('content/' + filename, 'r')
    content = infile.read()
    infile.close()
    soup = BeautifulSoup(content.decode('utf-8', 'ignore'))

    # Title and speaker live in the first <span> of this div
    div = soup.find('div', attrs={'class': 'sched-container-inner'})
    el_ls = div.findAll('span')

    # The span text has the form 'Title - Speaker, Company Location':
    # take everything before the first ' - ' as the title,
    # and everything after the last ' - ' as the speaker.
    el = el_ls[0].text.strip()
    title = re.sub(' - .*$', '', el)
    speaker = re.sub('^.* - ', '', el)

    outfile.write(u'\n\n## {}\n'.format(title))
    outfile.write(u'\n\n{}\n'.format(speaker))

    # The talk description, when present, sits in the 'tip-description' div
    det = div.find('div', attrs={'class': 'tip-description'})
    if det is not None:
        outfile.write(u'\n{}\n'.format(det.text.strip()))

outfile.close()

beautifulsoup sqlite
20161003

Create a DB by scraping a webpage

Download all the webpages and put them in a zipfile, to avoid re-downloading them on each try.
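Building that zipfile can look roughly like this; a minimal sketch, where the zip filename matches the code below but the URLs in url_ls are placeholders:

#!/usr/bin/python

import urllib
import zipfile

# placeholder list: the real result-page urls depend on the site
url_ls = ['http://example.com/results?page=1',
          'http://example.com/results?page=2']

zf = zipfile.ZipFile('brx_marathon_html.zip', 'w')
for i, url in enumerate(url_ls):
    # fetch the page and store it under a predictable name inside the zip
    html_doc = urllib.urlopen(url).read()
    zf.writestr('{:0>4d}.html'.format(i + 1), html_doc)
zf.close()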

If you want to work 'direct' (without the zipfile), then use this to read the html content of a url:

import urllib

html_doc = urllib.urlopen(url).read()
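Note: urllib.urlopen is Python 2; under Python 3 the same call lives in urllib.request:

import urllib.request

html_doc = urllib.request.urlopen(url).read()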

Preparation: create database table

cur.execute('DROP TABLE IF EXISTS t_result')

# The column names follow the Dutch result table of the site:
# gesl = gender, naam = name, leeftijd = age, tijd = time,
# cat_plaats/cat_naam = place and name of the age category,
# gemeente = municipality.
cur.execute('''
CREATE TABLE t_result(
        pos  varchar(128),
        nr  varchar(128),
        gesl varchar(128),
        naam varchar(128),
        leeftijd varchar(128),
        ioc varchar(128),
        tijd varchar(128),
        tkm varchar(128),
        gem varchar(128),
        cat_plaats varchar(128),
        cat_naam varchar(128),
        gemeente varchar(128)
        )
''')

Pull each html file from the zipfile

# Read each stored page from the zipfile and hand it to the parser
zf = zipfile.ZipFile('brx_marathon_html.zip', 'r')
for fn in zf.namelist():
    try:
        content = zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break

Parse the content of each html file with Beautiful Soup

soup = BeautifulSoup(content)

# The result table is identified by its cellspacing/cellpadding attributes
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
rows = table.findAll('tr')
for row in rows:
    cols = row.findAll('td')
    e = [ele.text.strip() for ele in cols]
    # only real data rows have a full set of columns
    if len(e) > 10:
        cur.execute('INSERT INTO t_result VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11]))

Note: the above code is Beautiful Soup 3; for Beautiful Soup 4, findAll needs to be replaced by find_all, and BeautifulSoup is imported from the bs4 package.
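For reference, the Beautiful Soup 4 version of the parsing step would look roughly like this (note the bs4 import and the explicit parser argument):

from bs4 import BeautifulSoup

soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
rows = table.find_all('tr')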

Complete source code

#!/usr/bin/python 

from BeautifulSoup import BeautifulSoup
import sqlite3
import zipfile

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

def handle_content(content): 
    soup = BeautifulSoup(content)

    table= soup.find('table', attrs={'cellspacing':'0', 'cellpadding':'2'}) 
    rows = table.findAll('tr')          # Note: bs3 findAll = find_all in bs4 !
    for row in rows:
        cols = row.findAll('td')
        e = [ ele.text.strip()  for ele in cols]
        if len(e)>10: 
            print u"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        e[0],e[1],e[2],e[3],e[4],e[5],e[6],e[7],e[8],e[9],e[10],e[11])  
            cur.execute('INSERT INTO T_RESULT VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,? )', 
                        (e[0],e[1],e[2],e[3],e[4],e[5],e[6],e[7],e[8],e[9],e[10],e[11]) ) 


cur.execute('DROP TABLE IF EXISTS t_result')

cur.execute('''
CREATE TABLE t_result(
        pos  varchar(128),
        nr  varchar(128),
        gesl varchar(128),
        naam varchar(128),
        leeftijd varchar(128),
        ioc varchar(128),
        tijd varchar(128),
        tkm varchar(128),
        gem varchar(128),
        cat_plaats varchar(128),
        cat_naam varchar(128),
        gemeente varchar(128)
        ) 
''')



# MAIN LOOP 
# read zipfile, and handle each file
zf=zipfile.ZipFile('brx_marathon_html.zip','r')
for fn in zf.namelist():
    try:
        content= zf.read(fn)
        handle_content(content) 
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break


conn.commit()
cur.close()
conn.close()
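To verify the load, query the table afterwards; a quick sketch:

#!/usr/bin/python

import sqlite3

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()
cur.execute('SELECT count(*) FROM t_result')
print cur.fetchone()[0]
cur.execute('SELECT naam, tijd FROM t_result LIMIT 3')
for row in cur.fetchall():
    print row
conn.close()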
 
Notes by Willem Moors. Generated on momo:/home/willem/sync/20151223_datamungingninja/pythonbook at 2019-07-31 19:22