The topics of KubeCon (the Kubernetes conference)
A Markdown doc 'source.md' holds all the presentation titles plus links:
[2000 Nodes and Beyond: How We Scaled Kubernetes to 60,000-Container Clusters and Where We're Going Next - Marek Grabowski, Google Willow A](/event/8K8w/2000-nodes-and-beyond-how-we-scaled-kubernetes-to-60000-container-clusters-and-where-were-going-next-marek-grabowski-google)
[How Box Runs Containers in Production with Kubernetes - Sam Ghods, Box Grand Ballroom D](/event/8K8u/how-box-runs-containers-in-production-with-kubernetes-sam-ghods-box)
[ITNW (If This Now What) - Orchestrating an Enterprise - Michael Ward, Pearson Grand Ballroom C](/event/8K8t/itnw-if-this-now-what-orchestrating-an-enterprise-michael-ward-pearson)
[Unik: Unikernel Runtime for Kubernetes - Idit Levine, EMC Redwood AB](/event/8K8v/unik-unikernel-runtime-
..
..
Step 1: generate the download script
Grab the links from 'source.md' and turn each one into a wget command.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re

# read the whole markdown file into one long string
buf = ""
infile = open('source.md', 'r')
for line in infile.readlines():
    buf += line.rstrip('\n')
infile.close()

# repeatedly match '(...)' pairs: group 2 is the relative link,
# group 3 is the rest of the buffer, kept for the next round
oo = 1
while True:
    match = re.search(r'^(.*?\()(/.[^\)]*)(\).*$)', buf)
    if match is None:
        break
    url = "https://cnkc16.sched.org" + match.group(2)
    print "wget '{}' -O {:0>4d}.html".format(url, oo)
    oo += 1
    buf = match.group(3)
Step 2: download the html
Execute the script generated by the code above (e.g. redirect its output into a shell script and run that), and put the resulting files in directory 'content':
wget 'https://cnkc16.sched.org/event/8K8w/2000-nodes-and-beyond-how-we-scaled-kubernetes-to-60000-container-clusters-and-where-were-going-next-marek-grabowski-google' -O 0001.html
wget 'https://cnkc16.sched.org/event/8K8u/how-box-runs-containers-in-production-with-kubernetes-sam-ghods-box' -O 0002.html
wget 'https://cnkc16.sched.org/event/8K8t/itnw-if-this-now-what-orchestrating-an-enterprise-michael-ward-pearson' -O 0003.html
..
Step 3: parse with Beautiful Soup
#!/usr/bin/python
# -*- coding: utf-8 -*-
from BeautifulSoup import *
import os
import re
import codecs

#outfile = file('text.md', 'w')
# ^^^ --> UnicodeEncodeError:
#     'ascii' codec can't encode character u'\u2019'
#     in position 73: ordinal not in range(128)
outfile = codecs.open("text.md", "w", "utf-8")

# collect the downloaded html files
file_ls = []
for filename in os.listdir("content"):
    if filename.endswith(".html"):
        file_ls.append(filename)

for filename in sorted(file_ls):
    infile = open('content/' + filename, 'r')
    content = infile.read()
    infile.close()
    soup = BeautifulSoup(content.decode('utf-8', 'ignore'))
    div = soup.find('div', attrs={'class': 'sched-container-inner'})
    el_ls = div.findAll('span')
    # the first span holds 'Title - Speaker, Company'
    el = el_ls[0].text.strip()
    title = re.sub(' - .*$', '', el)    # keep text before the first ' - '
    speaker = re.sub('^.* - ', '', el)  # keep text after the last ' - '
    outfile.write(u'\n\n## {}\n'.format(title))
    outfile.write(u'\n\n{}\n'.format(speaker))
    det = div.find('div', attrs={'class': 'tip-description'})
    if det is not None:
        outfile.write(u'\n{}\n'.format(det.text.strip()))
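Running this over the downloaded pages produces a 'text.md' roughly like the fragment below, reconstructed here from the first title in 'source.md' (the actual span text on the page may differ slightly), plus the session description if present:

## 2000 Nodes and Beyond: How We Scaled Kubernetes to 60,000-Container Clusters and Where We're Going Next

Marek Grabowski, Google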
20161003
Create a DB by scraping a webpage
Download all the webpages and put them in a zipfile (to avoid 're-downloading' on each try).
If you want to work 'direct', use this to read the html content of a url:
import urllib
html_doc = urllib.urlopen(url).read()
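Putting the two together, a minimal sketch of the download-into-zipfile step; the url list and the numbered filenames are assumptions, not the original code:

#!/usr/bin/python
import urllib
import zipfile

# hypothetical list of result pages -- the real urls are site-specific
url_ls = ['http://example.com/results/page1.html',
          'http://example.com/results/page2.html']

zf = zipfile.ZipFile('brx_marathon_html.zip', 'w')
for nr, url in enumerate(url_ls):
    html_doc = urllib.urlopen(url).read()                   # fetch the page
    zf.writestr('{:0>4d}.html'.format(nr + 1), html_doc)    # store under a numbered name
zf.close()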
Preparation: create database table
cur.execute('DROP TABLE IF EXISTS t_result')
cur.execute('''
CREATE TABLE t_result(
    pos varchar(128),          -- position
    nr varchar(128),           -- bib number
    gesl varchar(128),         -- gender ('geslacht')
    naam varchar(128),         -- name
    leeftijd varchar(128),     -- age
    ioc varchar(128),          -- IOC country code
    tijd varchar(128),         -- finish time
    tkm varchar(128),          -- time per km
    gem varchar(128),          -- average speed ('gemiddelde')
    cat_plaats varchar(128),   -- place within category
    cat_naam varchar(128),     -- category name
    gemeente varchar(128)      -- municipality
)
''')
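The snippet above assumes an open sqlite3 connection and cursor, set up as in the complete source below:

import sqlite3
conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()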
Pull each html file from the zipfile
zf = zipfile.ZipFile('brx_marathon_html.zip', 'r')
for fn in zf.namelist():
    try:
        content = zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break
Parse the content of each html file with Beautiful Soup
soup = BeautifulSoup(content)
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
rows = table.findAll('tr')
for row in rows:
    cols = row.findAll('td')
    e = [ele.text.strip() for ele in cols]
    if len(e) > 10:
        cur.execute('INSERT INTO t_result VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11]))
Note: the above code is Beautiful Soup 3; for Beautiful Soup 4, findAll needs to be replaced by find_all.
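A minimal sketch of the same parse under Beautiful Soup 4 (the import moves to the bs4 package, and the parser is named explicitly):

from bs4 import BeautifulSoup

soup = BeautifulSoup(content, 'html.parser')
table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
for row in table.find_all('tr'):       # bs4: find_all instead of findAll
    cols = row.find_all('td')
    e = [ele.text.strip() for ele in cols]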
Complete source code
#!/usr/bin/python
from BeautifulSoup import *
import sqlite3
import zipfile

conn = sqlite3.connect('marathon.sqlite')
cur = conn.cursor()

def handle_content(content):
    soup = BeautifulSoup(content)
    table = soup.find('table', attrs={'cellspacing': '0', 'cellpadding': '2'})
    rows = table.findAll('tr')  # Note: bs3 findAll = find_all in bs4 !
    for row in rows:
        cols = row.findAll('td')
        e = [ele.text.strip() for ele in cols]
        if len(e) > 10:
            print u"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11])
            cur.execute('INSERT INTO t_result VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                        (e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8], e[9], e[10], e[11]))

# create a fresh result table
cur.execute('DROP TABLE IF EXISTS t_result')
cur.execute('''
CREATE TABLE t_result(
    pos varchar(128),
    nr varchar(128),
    gesl varchar(128),
    naam varchar(128),
    leeftijd varchar(128),
    ioc varchar(128),
    tijd varchar(128),
    tkm varchar(128),
    gem varchar(128),
    cat_plaats varchar(128),
    cat_naam varchar(128),
    gemeente varchar(128)
)
''')

# MAIN LOOP
# read zipfile, and handle each file
zf = zipfile.ZipFile('brx_marathon_html.zip', 'r')
for fn in zf.namelist():
    try:
        content = zf.read(fn)
        handle_content(content)
    except KeyError:
        print 'ERROR: %s not in zip file' % fn
        break

conn.commit()
cur.close()
conn.close()
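To verify the import afterwards, a quick row count (a sketch, using the database and table names from above):

import sqlite3
conn = sqlite3.connect('marathon.sqlite')
print conn.execute('SELECT count(*) FROM t_result').fetchone()[0]
conn.close()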