#!/usr/local/bin/python
"""Colour the words of mail messages by how "spammy" a word corpus rates them.

Corpora are built from word counts (lingspam files, optionally a local inbox)
or from a precomputed probability list ('pglist'), stored together in the
pickle file 'sc.dump', and then used to write colourised copies of messages.

Command-line arguments are processed in order:

    -build-lingspam       build and save the 'lingspam' corpus
    -build-lingspamplus   build and save the 'lingspamplus' corpus
    -build-archub         build and save the 'archub' corpus
    BOX/NUMBER            fontify one message of a mailbox ('lingspamplus')
    DIRECTORY             fontify every message in it, once per stored corpus
    FILE                  fontify FILE to standard output ('lingspamplus')
"""

# NOTE: import order is kept as in the original file because the star import
# can shadow names bound before it.
import sys
import os
import re
import math
try:
    import cPickle
except ImportError:
    # Interpreters without cPickle: fall back to pickle under the same name.
    import pickle as cPickle
from utils import *
from glob import glob


def incr(hash, key):
    """Add one to the count stored under `key` in `hash` (missing keys count as 0)."""
    hash[key] = hash.get(key, 0) + 1


def getwords(fn, addto):
    """Tally the whitespace-separated words of file `fn` into the dict `addto`.

    Words of 20 or more characters are ignored.  Every counted word is
    lower-cased and also bumps the running total stored under the key '*'.
    """
    with open(fn) as f:
        text = f.read()
    # NOTE(review): a literal "*" token in the text lands on the same key as
    # the running total and so is counted twice there -- unchanged from the
    # original behaviour.
    for word in text.split():
        if len(word) < 20:
            incr(addto, word.lower())
            incr(addto, '*')


def spamcolor(x):
    """Map x in [0, 1] to a hex RGB string between '00ff00' (0) and 'ff0000' (1).

    Values outside [0, 1] are clamped to the nearest end of the range.
    """
    x = min(1.0, max(0.0, x))
    red = int(255 * x)
    green = int(255 * (1.0 - x))
    # The green channel lives in the second byte; the original multiplied it
    # by 0x000000, which always dropped it.
    return "%06x" % (red * 0x010000 + green * 0x000100)


# The class names below are part of the on-disk format: 'sc.dump' pickles
# instances of them, so they must not be renamed.
class spamly_words:
    """Shared fontifying behaviour; subclasses supply spamliness(word).

    fontify_mail() additionally reads `self.name`, which save_corpuses()
    assigns before a corpus is pickled.
    """

    def fontify(self, text):
        """Return `text` with noisy headers removed and body words colourised.

        `text` is split into headers and body at the first blank line.  The
        listed header fields and any '*...SPAM...*' markers are removed from
        the headers, lines starting with 'SPAM: ' are removed from the body,
        and each word of the body is wrapped in colour markup chosen from
        self.spamliness(word).
        """
        headers, body = (text + '\n\n').split('\n\n', 1)
        headers = re.sub(r'(?mi)^(X-|Errors-To|Reply-To|Date|Delivered-To|Content-|Message-Id|Delivery-Date|Return-Path|MIME-|Received|In-Reply-To|Sender)(.|\n\s)+\n', '', headers + '\n')
        headers = re.sub(r'\*+SPAM\*+', '', headers)
        body = re.sub(r'(?m)^SPAM: .*\n', '', body)

        def colorword(m):
            word = m.group(1)
            # NOTE(review): the markup in this literal was missing from the
            # source as received (one %s for two arguments, which raises
            # TypeError).  A <font color> wrapper is assumed -- confirm.
            return '<font color="#%s">%s</font>' % (
                spamcolor(self.spamliness(word)), word)

        # NOTE(review): the surrounding text is not HTML-escaped, so markup
        # characters in the message pass straight through to the output.
        cbody = re.sub(r'\b(\w+)\b', colorword, body)
        return headers + '\n' + cbody

    def fontify_dir(self, indir, outdir):
        """Fontify every message file in `indir` into `outdir` as NAME.html.

        A message file is one whose name starts and ends with a digit.
        `outdir` is created if it does not exist yet.
        """
        print("Fontifying %s -> %s" % (indir, outdir))
        try:
            os.mkdir(outdir)
        except OSError:
            # Usually "already exists"; any real problem surfaces when the
            # first output file is opened.
            pass
        # The original pattern '[0-9]*[0-9]' needs two characters and so
        # skipped single-digit message names; filter on the last character
        # instead.
        for infn in glob(indir + '/[0-9]*'):
            if not infn[-1:].isdigit():
                continue
            outfn = infn.replace(indir, outdir)
            assert outfn != infn
            outfn += '.html'
            self.fontify_file(infn, outfn)

    def fontify_file(self, infn, outfn):
        """Fontify one message from `infn` to `outfn`.

        A false `infn` means standard input and a false `outfn` means
        standard output.  At most 60000 characters are read; messages of
        50000 characters or more are skipped without writing anything.
        """
        if infn:
            infile = open(infn)
        else:
            infile = sys.stdin
        try:
            intext = infile.read(60000)
        finally:
            # Close only what was opened here; stdin belongs to the caller.
            if infile is not sys.stdin:
                infile.close()

        # NOTE(review): the source indentation was lost; the size test is
        # taken to guard all of the output -- confirm.
        if len(intext) >= 50000:
            return

        outtext = self.fontify(intext)
        # NOTE(review): the replacement literal was broken in the source as
        # received; '<br>' plus newline is assumed -- confirm.
        html = re.sub(r'\n', '<br>\n', outtext)
        if outfn:
            with open(outfn, 'w') as outfile:
                outfile.write(html)
        else:
            # Do not close stdout: main() may fontify several files to it.
            sys.stdout.write(html)

    def fontify_mail(self, box, number):
        """Fontify message `number` of mailbox `box` under /usr/tlb/Mail."""
        self.fontify_file('/usr/tlb/Mail/%s/%d' % (box, number),
                          '/usr/tlb/Mail/%s.spamly.%s/%d.html' % (box, self.name, number))


class spamly_words_inout(spamly_words):
    """Corpus of raw word counts for spam and non-spam text.

    Both dicts map lower-cased words to counts, with the total number of
    counted words under the key '*'.
    """

    def __init__(self):
        self.spam = {'*': 0}
        self.nonspam = {'*': 0}

    def spamliness(self, word):
        """Return a score in [0, 1] for `word`; 0.0 if never seen in spam.

        The score is num / (num + den), where num is the word's relative
        frequency in spam and den its relative frequency in non-spam with 2
        added to the non-spam count.  If either corpus is empty the score is
        0.0 instead of raising ZeroDivisionError.
        """
        lw = word.lower()
        spam_total = self.spam['*']
        nonspam_total = self.nonspam['*']
        if not spam_total:
            # No spam seen at all, so the word cannot have been seen in spam.
            return 0.0
        num = float(self.spam.get(lw, 0)) / spam_total
        if num == 0:
            return 0.0
        if not nonspam_total:
            # den would be 2/0; as den grows without bound the score tends
            # to 0.
            return 0.0
        den = float(self.nonspam.get(lw, 0) + 2) / nonspam_total
        return num / (num + den)


class spamly_words_prob(spamly_words):
    """Corpus that stores a ready-made score per lower-cased word."""

    def __init__(self):
        self.prob = {'*': 0}

    def spamliness(self, word):
        """Return the stored score for `word`, or 0.0 if it is unknown."""
        return self.prob.get(word.lower(), 0.0)


def _scan_lingspam(sw):
    """Fill `sw` with lingspam spam and non-spam counts, reporting progress."""
    print("Scanning lingspam_public/bare/part*...")
    for x in glob("lingspam_public/bare/part*/spm*.txt"):
        getwords(x, sw.spam)
    print("Spam: %d words known" % len(sw.spam))
    for x in glob("lingspam_public/bare/part*/[0-9]-*.txt"):
        getwords(x, sw.nonspam)


def build_lingspam_corpus():
    """Build a count corpus from the lingspam files under the current directory."""
    sw = spamly_words_inout()
    _scan_lingspam(sw)
    print("Nonspam: %d words known" % len(sw.nonspam))
    return sw


def build_lingspamplus_corpus():
    """Build the lingspam corpus, adding /usr/tlb/Mail/inbox as non-spam."""
    sw = spamly_words_inout()
    _scan_lingspam(sw)
    print("Scanning /usr/tlb/Mail/inbox ...")
    for x in glob("/usr/tlb/Mail/inbox/[0-9]*"):
        getwords(x, sw.nonspam)
    print("Nonspam: %d words known" % len(sw.nonspam))
    return sw


def build_archub_corpus():
    """Build a probability corpus from the '(word number)' entries in 'pglist'.

    Words may contain '|...|' quoted parts; the bars are removed and the
    word is lower-cased before it is stored.
    """
    print("Reading pglist...")
    sw = spamly_words_prob()
    with open('pglist') as f:
        text = f.read()
    fixword = re.compile(r'\|')
    for m in re.findall(r'\(((?:[\w\-]|(?:\|.*\|))+) ([\d\.]+)\)', text):
        word = re.sub(fixword, '', m[0]).lower()
        sw.prob[word] = float(m[1])
    return sw


# Command-line flag -> (corpus name, builder function).
_BUILDERS = {
    '-build-lingspam': ('lingspam', build_lingspam_corpus),
    '-build-lingspamplus': ('lingspamplus', build_lingspamplus_corpus),
    '-build-archub': ('archub', build_archub_corpus),
}

# Corpus name -> corpus object; replaced by load_corpuses().
corpuses = {}


def save_corpuses():
    """Record each corpus's name on it and pickle all corpora to 'sc.dump'."""
    for x in corpuses:
        corpuses[x].name = x
    # Protocol 1 is a binary format, so the file must be opened in binary mode.
    with open('sc.dump', 'wb') as f:
        cPickle.Pickler(f, 1).dump(corpuses)


def load_corpuses():
    """Replace the global `corpuses` with the contents of 'sc.dump'.

    If 'sc.dump' does not exist, `corpuses` is left as it is so that a first
    '-build-*' run can create the file.
    """
    global corpuses
    if not os.path.exists('sc.dump'):
        return
    # SECURITY: unpickling can execute arbitrary code; only load an 'sc.dump'
    # that this program wrote itself.
    with open('sc.dump', 'rb') as f:
        corpuses = cPickle.Unpickler(f).load()


def isdir(path):
    """Return True if `path` is an existing directory, False otherwise."""
    # The original tested st_mode & 1, which is the others-execute
    # permission bit rather than the file type.
    return os.path.isdir(path)


def main():
    """Load the stored corpora, then act on each command-line argument in turn.

    The BOX/NUMBER and FILE forms use the 'lingspamplus' corpus and raise
    KeyError if it has not been built yet.
    """
    load_corpuses()
    for arg in sys.argv[1:]:
        if arg in _BUILDERS:
            name, build = _BUILDERS[arg]
            corpuses[name] = build()
            save_corpuses()
            continue

        # The mailbox form is tested before the directory form, as in the
        # original.
        m = re.match(r'(\w+)\/(\d+)', arg)
        if m:
            sw = corpuses['lingspamplus']
            sw.fontify_mail(m.group(1), int(m.group(2)))
        elif isdir(arg):
            for which in corpuses:
                print("Doing %s corpus..." % which)
                corpuses[which].fontify_dir(arg, arg + '.spamly.' + which)
        else:
            corpuses['lingspamplus'].fontify_file(arg, None)


if __name__ == '__main__':
    main()