The Python Book
 
beautifulsoup
20161013

The topics of KubeCon (the Kubernetes conference)

Input

Markdown doc 'source.md' with all the presentation titles plus links:

[2000 Nodes and Beyond: How We Scaled Kubernetes to 60,000-Container Clusters
and Where We're Going Next - Marek Grabowski, Google Willow
A](/event/8K8w/2000-nodes-and-beyond-how-we-scaled-kubernetes-to-60000
-container-clusters-and-where-were-going-next-marek-grabowski-google) [How Box
Runs Containers in Production with Kubernetes - Sam Ghods, Box Grand Ballroom
D](/event/8K8u/how-box-runs-containers-in-production-with-kubernetes-sam-
ghods-box) [ITNW (If This Now What) - Orchestrating an Enterprise - Michael
Ward, Pearson Grand Ballroom C](/event/8K8t/itnw-if-this-now-what-
orchestrating-an-enterprise-michael-ward-pearson) [Unik: Unikernel Runtime for
Kubernetes - Idit Levine, EMC Redwood AB](/event/8K8v/unik-unikernel-runtime-
..
..

Step 1: generate download script

Grab the links from 'source.md' and generate a wget command for each of them:

#!/usr/bin/python 
# -*- coding: utf-8 -*-

import re

# Read 'source.md' into a single string, dropping the newlines,
# so that links wrapped across lines are joined back together.
buf = ""
infile = open('source.md', 'r')
for line in infile.readlines():
    buf += line.rstrip('\n')
infile.close()

# Repeatedly match the first markdown link target '(...)' in the buffer:
# group(2) is the relative URL, group(3) is the remainder of the buffer,
# which becomes the input for the next iteration.
oo = 1   # sequence number for the output filenames
while True:
    match = re.search(r'^(.*?\()(/[^)]*)(\).*$)', buf)
    if match is None:
        break
    url = "https://cnkc16.sched.org" + match.group(2)
    print "wget '{}' -O {:0>4d}.html".format(url, oo)
    oo += 1
    buf = match.group(3)
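Run the generator and redirect its output into a shell script; the name gen_download.py is just an assumed filename for this example:

python gen_download.py > download.sh
sh download.sh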

Step 2: download the HTML

Execute the script generated by the code above, and put the resulting files in the directory 'content':

wget 'https://cnkc16.sched.org/event/8K8w/2000-nodes-and-beyond-how-
      we-scaled-kubernetes-to-60000-container-clusters-and-where-were-
      going-next-marek-grabowski-google' -O 0001.html
wget 'https://cnkc16.sched.org/event/8K8u/how-box-runs-containers-in-
      production-with-kubernetes-sam-ghods-box' -O 0002.html
wget 'https://cnkc16.sched.org/event/8K8t/itnw-if-this-now-what-
      orchestrating-an-enterprise-michael-ward-pearson' -O 0003.html
.. 

Step 3: parse with Beautiful Soup

#!/usr/bin/python 
# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
import os
import re
import codecs

#outfile = file('text.md', 'w')
# ^^^ --> UnicodeEncodeError: 
#                'ascii' codec can't encode character u'\u2019' 
#                in position 73: ordinal not in range(128)
outfile = codecs.open("text.md", "w", "utf-8")

# Collect all downloaded html files from the 'content' directory
file_ls = []
for filename in os.listdir("content"):
    if filename.endswith(".html"):
        file_ls.append(filename)

for filename in sorted(file_ls):
    infile = open('content/' + filename, 'r')
    content = infile.read()
    infile.close()
    soup = BeautifulSoup(content.decode('utf-8', 'ignore'))

    # Title and speaker live in the first <span> of this div
    div = soup.find('div', attrs={'class': 'sched-container-inner'})
    el_ls = div.findAll('span')

    # The span text has the form 'Title - Speaker, Company Location':
    # take everything before the first ' - ' as the title,
    # and everything after the last ' - ' as the speaker.
    el = el_ls[0].text.strip()
    title = re.sub(' - .*$', '', el)
    speaker = re.sub('^.* - ', '', el)

    outfile.write(u'\n\n## {}\n'.format(title))
    outfile.write(u'\n\n{}\n'.format(speaker))

    # The talk description, when present, sits in the 'tip-description' div
    det = div.find('div', attrs={'class': 'tip-description'})
    if det is not None:
        outfile.write(u'\n{}\n'.format(det.text.strip()))

outfile.close()

beautifulsoup sqlite
20161003

Create a DB by scraping a webpage

Download all the webpages and put them in a zipfile, to avoid re-downloading them on each try.
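Building that zipfile can look roughly like this; a minimal sketch, where the zip filename matches the code below but the URLs in url_ls are placeholders:

#!/usr/bin/python

import urllib
import zipfile

# placeholder list: the real result-page urls depend on the site
url_ls = ['http://example.com/results?page=1',
          'http://example.com/results?page=2']

zf = zipfile.ZipFile('brx_marathon_html.zip', 'w')
for i, url in enumerate(url_ls):
    # fetch the page and store it under a predictable name inside the zip
    html_doc = urllib.urlopen(url).read()
    zf.writestr('{:0>4d}.html'.format(i + 1), html_doc)
zf.close()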

If you want to work 'direct' (without the zipfile), then use this to read the html content of a url:

import urllib

html_doc = urllib.urlopen(url).read()
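Note: urllib.urlopen is Python 2; under Python 3 the same call lives in urllib.request:

import urllib.request

html_doc = urllib.request.urlopen(url).read()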

Preparation: create database table

cur.execute('DROP TABLE IF EXISTS t_result')

# The column names follow the Dutch result table of the site:
# gesl = gender, naam = name, leeftijd = age, tijd = time,
# cat_plaats/cat_naam = place and name of the age category,
# gemeente = municipality.
cur.execute('''
CREATE TABLE t_result(
        pos  varchar(128),
        nr  varchar(128),
        gesl varchar(128),
        naam varchar(128),
        leeftijd varchar(128),
        ioc varchar(128),
        tijd varchar(128),
        tkm varchar(128),
        gem varchar(128),
        cat_plaats varchar(128),
        cat_naam varchar(128),
        gemeente varchar(128)
        )
''')

Pull each html file from the zipfile

# Read each stored page from the zipfile and hand it to the parser
zf = zipfile.ZipFile('brx_marathon_html.zip', 'r')
for fn in zf.namelist():
    try:
        content = zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break

Parse the content of each html file with Beautiful Soup

soup = BeautifulSoup(content)

# The result table is identified by its cellspacing/cellpadding attributes
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
rows = table.findAll('tr')
for row in rows:
    cols = row.findAll('td')
    e = [ele.text.strip() for ele in cols]
    # only real data rows have a full set of columns
    if len(e) > 10:
        cur.execute('INSERT INTO t_result VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11]))

Note: the above code is Beautiful Soup 3; for Beautiful Soup 4, findAll needs to be replaced by find_all, and BeautifulSoup is imported from the bs4 package.
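For reference, the Beautiful Soup 4 version of the parsing step would look roughly like this (note the bs4 import and the explicit parser argument):

from bs4 import BeautifulSoup

soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
rows = table.find_all('tr')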

Complete source code

#!/usr/bin/python 

from BeautifulSoup import BeautifulSoup
import sqlite3
import zipfile

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

def handle_content(content): 
    soup = BeautifulSoup(content)

    table= soup.find('table', attrs={'cellspacing':'0', 'cellpadding':'2'}) 
    rows = table.findAll('tr')          # Note: bs3 findAll = find_all in bs4 !
    for row in rows:
        cols = row.findAll('td')
        e = [ ele.text.strip()  for ele in cols]
        if len(e)>10: 
            print u"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        e[0],e[1],e[2],e[3],e[4],e[5],e[6],e[7],e[8],e[9],e[10],e[11])  
            cur.execute('INSERT INTO T_RESULT VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,? )', 
                        (e[0],e[1],e[2],e[3],e[4],e[5],e[6],e[7],e[8],e[9],e[10],e[11]) ) 


cur.execute('DROP TABLE IF EXISTS t_result')

cur.execute('''
CREATE TABLE t_result(
        pos  varchar(128),
        nr  varchar(128),
        gesl varchar(128),
        naam varchar(128),
        leeftijd varchar(128),
        ioc varchar(128),
        tijd varchar(128),
        tkm varchar(128),
        gem varchar(128),
        cat_plaats varchar(128),
        cat_naam varchar(128),
        gemeente varchar(128)
        ) 
''')



# MAIN LOOP 
# read zipfile, and handle each file
zf=zipfile.ZipFile('brx_marathon_html.zip','r')
for fn in zf.namelist():
    try:
        content= zf.read(fn)
        handle_content(content) 
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break


conn.commit()
cur.close()
conn.close()
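To verify the load, query the table afterwards; a quick sketch:

#!/usr/bin/python

import sqlite3

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()
cur.execute('SELECT count(*) FROM t_result')
print cur.fetchone()[0]
cur.execute('SELECT naam, tijd FROM t_result LIMIT 3')
for row in cur.fetchall():
    print row
conn.close()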
 
Notes by Willem Moors. Generated on momo:/home/willem/sync/20151223_datamungingninja/pythonbook at 2019-07-31 19:22