#!/usr/local/bin/python
import sys
import os
import re
import math
import cPickle
from utils import *
from glob import glob
def incr(hash, key):
    """Add one to the tally stored under ``key`` in the mapping ``hash``.

    A key that is not present yet is treated as having a tally of zero,
    so it ends up at 1.  The mapping is updated in place and nothing is
    returned.
    """
    if key in hash:
        hash[key] = hash[key] + 1
    else:
        hash[key] = 1
def getwords(fn, addto, maxlen=20):
    """Tally the whitespace-separated words of file ``fn`` into ``addto``.

    Each word shorter than ``maxlen`` characters is lower-cased and counted
    in the ``addto`` mapping via ``incr``; longer tokens are ignored.  The
    special key ``'*'`` is incremented alongside every counted word.

    Parameters:
        fn     -- path of the text file to read.
        addto  -- mapping of word -> count, updated in place.
        maxlen -- words with this many characters or more are skipped
                  (defaults to 20).

    Raises whatever ``open``/``read`` raise for an unreadable file.
    """
    f = open(fn)
    try:
        text = f.read()
    finally:
        # Release the handle even when the read itself fails.
        f.close()
    for word in text.split():
        if len(word) < maxlen:
            incr(addto, word.lower())
            # NOTE(review): '*' is bumped once per counted word here, making
            # it the total used as the divisor when scoring; confirm this
            # nesting against the original layout of the file.
            incr(addto, '*')
def spamcolor(x):
    """Map a score ``x`` to a six-digit hexadecimal RGB colour string.

    The red channel is ``int(255 * x)``.  A green level is computed from
    ``1.0 - x`` as well, but it is weighted by ``0x000000`` and therefore
    never shows up in the result: ``x = 0`` gives ``'000000'`` and
    ``x = 1`` gives ``'ff0000'``.
    """
    green_level = int(255 * (1.0 - x))
    red_level = int(255 * x)
    rgb = green_level * 0x000000 + red_level * 0x010000
    return "%06x" % rgb
class spamly_words:
    """Base class that renders mail text as HTML coloured per word.

    Subclasses must provide ``spamliness(word)``; its result is handed to
    ``spamcolor`` to pick the colour of each body word.  ``fontify_mail``
    additionally reads ``self.name``, which this class does not set itself.
    """

    def fontify(self, text):
        """Return ``text`` with noisy headers removed and body words coloured.

        The text is split at the first blank line into headers and body.
        Header lines matching the list in the pattern below (including their
        continuation lines) and ``*SPAM*`` markers are removed from the
        headers; lines starting with ``SPAM: `` are removed from the body.
        Every word of the body is then wrapped in a font tag whose colour is
        ``spamcolor(self.spamliness(word))``.
        """
        headers, body = (text + '\n\n').split('\n\n', 1)
        headers = re.sub(r'(?mi)^(X-|Errors-To|Reply-To|Date|Delivered-To|Content-|Message-Id|Delivery-Date|Return-Path|MIME-|Received|In-Reply-To|Sender)(.|\n\s)+\n', '', headers + '\n')
        headers = re.sub(r'\*+SPAM\*+', '', headers)
        body = re.sub(r'(?m)^SPAM: .*\n', '', body)

        def colorword(m):
            word = m.group(1)
            # Two values (colour, word) need two placeholders; a lone '%s'
            # raises TypeError as soon as the first word is formatted.
            return '<font color="#%s">%s</font>' % (
                spamcolor(self.spamliness(word)), word)

        # NOTE(review): message text is copied into the HTML output without
        # escaping; markup inside a message is passed through as-is.
        cbody = re.sub(r'\b(\w+)\b', colorword, body)
        return headers + '\n' + cbody

    def fontify_dir(self, indir, outdir):
        """Fontify every matching file of ``indir`` into ``outdir``.

        Files whose names match ``[0-9]*[0-9]`` are processed; each result
        is written to ``outdir`` under the same name plus ``.html``.
        ``outdir`` is created when possible.
        """
        print("Fontifying %s -> %s" % (indir, outdir))
        try:
            os.mkdir(outdir)
        except OSError:
            # Best effort: the directory normally exists already.  Any real
            # problem surfaces when the first output file is opened.
            pass
        # NOTE(review): this pattern needs a digit at both ends, so
        # one-character names never match -- confirm that is intended.
        for infn in glob(indir + '/[0-9]*[0-9]'):
            # Build the target from the base name; a textual replace of
            # indir would also rewrite matching text inside the file name.
            outfn = os.path.join(outdir, os.path.basename(infn))
            assert outfn != infn
            self.fontify_file(infn, outfn + '.html')

    def fontify_file(self, infn, outfn):
        """Fontify one message and write it as HTML.

        A false ``infn`` means read standard input; a false ``outfn`` means
        write to standard output.  At most 60000 characters are read, and
        input that yields 50000 characters or more produces no output.
        Only files opened here are closed; the standard streams stay open
        so that callers can keep using them.
        """
        if infn:
            infile = open(infn)
        else:
            infile = sys.stdin
        try:
            intext = infile.read(60000)
        finally:
            if infile is not sys.stdin:
                infile.close()
        # NOTE(review): oversized messages are taken to be skipped entirely;
        # confirm this nesting against the original layout of the file.
        if len(intext) >= 50000:
            return
        outtext = self.fontify(intext)
        if outfn:
            outfile = open(outfn, 'w')
        else:
            outfile = sys.stdout
        try:
            # Mark every line end with <br> so the line structure survives
            # HTML rendering.
            outfile.write(re.sub(r'\n', '<br>\n', outtext))
        finally:
            if outfile is not sys.stdout:
                outfile.close()

    def fontify_mail(self, box, number):
        """Fontify message ``number`` of mailbox ``box`` under /usr/tlb/Mail.

        The output goes to the sibling directory
        ``<box>.spamly.<self.name>`` as ``<number>.html``.
        """
        self.fontify_file('/usr/tlb/Mail/%s/%d' % (box, number),
                          '/usr/tlb/Mail/%s.spamly.%s/%d.html' % (box, self.name, number))
class spamly_words_inout(spamly_words):
    """Scores words from separate spam and non-spam tallies.

    ``spam`` and ``nonspam`` map lower-cased words to counts; in each, the
    ``'*'`` entry holds the total that the word counts are divided by.
    """

    def __init__(self):
        """Start with empty tallies whose totals are zero."""
        self.spam = {'*': 0}
        self.nonspam = {'*': 0}

    def spamliness(self, word):
        """Return a score in [0, 1) for ``word``; higher means spammier.

        With ``num = spam[word] / spam['*']`` and
        ``den = (nonspam[word] + 2) / nonspam['*']`` the score is
        ``num / (num + den)``.  Words never seen in spam score 0.0.  When
        either total is zero there is nothing to divide by, so the score is
        0.0 as well instead of a ZeroDivisionError.
        """
        lw = word.lower()
        spam_total = self.spam['*']
        nonspam_total = self.nonspam['*']
        if not spam_total or not nonspam_total:
            # An empty corpus gives no basis for a score.
            return 0.0
        num = float(self.spam.get(lw, 0)) / spam_total
        if num == 0:
            return 0.0
        # The +2 keeps the non-spam term positive for words absent from
        # the non-spam tally.
        den = float(self.nonspam.get(lw, 0) + 2) / nonspam_total
        return num / (num + den)
class spamly_words_prob(spamly_words):
    """Scores words by looking them up in a table of stored values.

    ``prob`` maps lower-cased words to their score and starts out holding
    only a ``'*'`` entry of 0.
    """

    def __init__(self):
        """Create the lookup table with its initial ``'*'`` entry."""
        self.prob = {'*': 0}

    def spamliness(self, word):
        """Return the stored score of ``word`` (case-insensitive), else 0.0."""
        key = word.lower()
        if key in self.prob:
            return self.prob[key]
        return 0.0
def build_lingspam_corpus():
    """Build a ``spamly_words_inout`` from the lingspam_public/bare files.

    Files named ``spm*.txt`` feed the spam tally and files named
    ``[0-9]-*.txt`` feed the non-spam tally.  Progress and the number of
    known entries are printed along the way.  Returns the filled object.
    """
    print("Scanning lingspam_public/bare/part*...")
    corpus = spamly_words_inout()

    spam_files = glob("lingspam_public/bare/part*/spm*.txt")
    for path in spam_files:
        getwords(path, corpus.spam)
    print("Spam: %d words known" % len(corpus.spam))

    legit_files = glob("lingspam_public/bare/part*/[0-9]-*.txt")
    for path in legit_files:
        getwords(path, corpus.nonspam)
    print("Nonspam: %d words known" % len(corpus.nonspam))

    return corpus
def build_lingspamplus_corpus():
    """Build a ``spamly_words_inout`` from lingspam plus the local inbox.

    The spam tally comes from the ``spm*.txt`` files of
    lingspam_public/bare.  The non-spam tally combines the ``[0-9]-*.txt``
    files found there with every ``[0-9]*`` file in /usr/tlb/Mail/inbox.
    Progress and the number of known entries are printed along the way.
    Returns the filled object.
    """
    print("Scanning lingspam_public/bare/part*...")
    corpus = spamly_words_inout()

    spam_files = glob("lingspam_public/bare/part*/spm*.txt")
    for path in spam_files:
        getwords(path, corpus.spam)
    print("Spam: %d words known" % len(corpus.spam))

    legit_files = glob("lingspam_public/bare/part*/[0-9]-*.txt")
    for path in legit_files:
        getwords(path, corpus.nonspam)

    print("Scanning /usr/tlb/Mail/inbox ...")
    inbox_files = glob("/usr/tlb/Mail/inbox/[0-9]*")
    for path in inbox_files:
        getwords(path, corpus.nonspam)
    print("Nonspam: %d words known" % len(corpus.nonspam))

    return corpus
def build_archub_corpus():
    """Build a ``spamly_words_prob`` from the file ``pglist``.

    The file is scanned for ``(word number)`` entries.  The word part may
    contain ``|...|`` quoted stretches, whose bar characters are dropped;
    the word is lower-cased and stored with the number converted to float.
    Returns the filled object.

    Raises whatever ``open``/``read`` raise when ``pglist`` is unreadable,
    and ValueError when a matched number is not a valid float.
    """
    print("Reading pglist...")
    sw = spamly_words_prob()
    f = open('pglist')
    try:
        text = f.read()
    finally:
        # Release the handle even when the read itself fails.
        f.close()
    fixword = re.compile(r'\|')
    for raw_word, prob in re.findall(r'\(((?:[\w\-]|(?:\|.*\|))+) ([\d\.]+)\)', text):
        word = fixword.sub('', raw_word).lower()
        sw.prob[word] = float(prob)
    return sw
# Registry of the available corpora, keyed by corpus name.  main() fills it
# under the keys 'lingspam', 'lingspamplus' and 'archub'; save_corpuses() and
# load_corpuses() write it to and read it from the file 'sc.dump'.
corpuses={}
def save_corpuses():
    """Pickle the ``corpuses`` registry to the file ``sc.dump``.

    Each corpus object first gets its registry key stored as ``.name``.
    """
    global corpuses
    for x in corpuses:
        corpuses[x].name = x
    # Pickle protocol 1 is a binary format, so the file must be opened in
    # binary mode; text mode can corrupt the data on some platforms.
    f = open('sc.dump', 'wb')
    try:
        cPickle.Pickler(f, 1).dump(corpuses)
    finally:
        # Release the handle even when pickling fails part-way.
        f.close()
def load_corpuses():
    """Replace the ``corpuses`` registry with the contents of ``sc.dump``.

    When ``sc.dump`` does not exist the registry becomes empty, so a first
    run can still go on to build and save a corpus.

    NOTE: unpickling runs code chosen by whoever wrote the file; only load
    an ``sc.dump`` that this program produced itself.
    """
    global corpuses
    if not os.path.exists('sc.dump'):
        corpuses = {}
        return
    # The dump is written with a binary pickle protocol, so read it in
    # binary mode.
    f = open('sc.dump', 'rb')
    try:
        corpuses = cPickle.Unpickler(f).load()
    finally:
        f.close()
def isdir(path):
    """Return True when ``path`` names an existing directory, else False.

    The file type is what gets checked; testing bit 0 of ``st_mode`` would
    look at the "others may execute" permission bit instead.  A path that
    does not exist yields False rather than raising OSError.
    """
    return os.path.isdir(path)
def main():
    """Process the command-line arguments one after another.

    The saved corpora are loaded first.  Then, for each argument:

    * ``-build-lingspam``, ``-build-lingspamplus`` or ``-build-archub``
      rebuilds that corpus and saves the registry;
    * text starting with ``<box>/<number>`` fontifies that mail message
      with the 'lingspamplus' corpus;
    * a path accepted by ``isdir`` is fontified once per known corpus into
      ``<path>.spamly.<corpus>``;
    * anything else is treated as a file and fontified to standard output
      with the 'lingspamplus' corpus.
    """
    global corpuses
    load_corpuses()

    # Command-line option -> (registry key, function that builds the corpus).
    builders = {
        '-build-lingspam': ('lingspam', build_lingspam_corpus),
        '-build-lingspamplus': ('lingspamplus', build_lingspamplus_corpus),
        '-build-archub': ('archub', build_archub_corpus),
    }
    mail_ref = re.compile(r'(\w+)\/(\d+)')

    for arg in sys.argv[1:]:
        if arg in builders:
            key, build = builders[arg]
            corpuses[key] = build()
            save_corpuses()
            continue

        found = mail_ref.match(arg)
        if found:
            sw = corpuses['lingspamplus']
            sw.fontify_mail(found.group(1), int(found.group(2)))
            continue

        if isdir(arg):
            for which in corpuses:
                print("Doing %s corpus..." % which)
                corpuses[which].fontify_dir(arg, arg + '.spamly.' + which)
            continue

        corpuses['lingspamplus'].fontify_file(arg, None)
# Run the command-line driver only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()