Talk:Machine Learning
Feb. 27, 2014
Folks met and hacked on the noisebridge-discuss mailing list archive. We created a 102 MB text dump and a Python script to parse it (File:Py-piper-parser.txt). We wrote pseudocode for a naive Bayes filter to protect the world from trolls; we will implement it soon.
Python code to download and decompress the nb-discuss archive:
from StringIO import StringIO
from gzip import GzipFile
from time import gmtime
from urllib import urlopen
def decompress_from_url(u):
    """Download the gzip-compressed resource at URL *u* and return its
    decompressed contents.

    The whole response is read into memory before it is decompressed.
    Every handle (HTTP response, in-memory buffer, gzip reader) is closed
    even when reading or decompressing raises, so a failed download does
    not leak an open connection.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from contextlib import closing

    # closing() is used instead of a bare ``with`` because the Python 2
    # urlopen() result and StringIO objects are not context managers.
    with closing(urlopen(u)) as response:
        compressed = response.read()
    with closing(StringIO(compressed)) as buf:
        with closing(GzipFile(fileobj=buf)) as gz:
            return gz.read()
def discuss_gz_url(m, y):
    """Return the URL of the gzipped noisebridge-discuss archive for month
    *m* (1-12) of year *y*.

    Returns None when the month is outside 1-12, the year is before 2007,
    or the requested month lies after the current month (UTC).
    """
    if m < 1 or m > 12 or y < 2007:
        return None

    today = gmtime()
    # Tuple comparison: later year, or same year with a later month.
    if (y, m) > (today.tm_year, today.tm_mon):
        return None

    month_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    base = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/'
    suffix = '.txt.gz'
    return base + str(y) + '-' + month_names[m - 1] + suffix
def all_discuss_gz_urls():
    """Yield the archive URL of every month from November 2007 up to and
    including the current month (UTC), in chronological order.
    """
    today = gmtime()
    this_year = today.tm_year
    for y in range(2007, this_year + 1):
        # The archive starts in November 2007; every other year starts in January.
        first_month = 11 if y == 2007 else 1
        # Only the current year (when it is not 2007) is cut off at the current month.
        if y != 2007 and y == this_year:
            last_month = today.tm_mon
        else:
            last_month = 12
        for m in range(first_month, last_month + 1):
            yield discuss_gz_url(m, y)
def discuss_a_month(month, year):
    """Download and return the decompressed archive text for one month.

    The URL is built by discuss_gz_url(month, year) and fetched with
    decompress_from_url().
    """
    return decompress_from_url(discuss_gz_url(month, year))
def spew():
    """Yield the decompressed text of each monthly archive, one month at a
    time, in the order produced by all_discuss_gz_urls().
    """
    archive_urls = all_discuss_gz_urls()
    for archive_url in archive_urls:
        month_text = decompress_from_url(archive_url)
        yield month_text
def dump_uncompressed(filename="nb_wtf.txt"):
    """Write every decompressed monthly archive, back to back, into
    *filename* (overwriting any existing file).
    """
    with open(filename, "w") as out:
        # writelines() consumes the generator one archive at a time.
        out.writelines(spew())
Word-parsing Python script
The function 'get_words' takes a list of dictionaries, one per email. For each message, it yields the list of words in that message:
def get_words(lst):
    """Yield, for each email dictionary in *lst*, the list of
    whitespace-separated words found under its 'messageline' key.
    """
    for email in lst:
        yield email['messageline'].split()
Plans to improve by using nltk[1]