"""ifexistsLogSorts.py - list the distinct page URLs of one wiki from an #ifexist log.

Reads the log file ``ifexists.log.1`` (obtained from
http://noc.wikimedia.org/~tstarling/ifexist.log), whose lines look like::

    2007-12-03 06:27:16 zhwiki: 131 http://zh.wikipedia.org/wiki/%E6%B4%9B%E7%A3%AF%E5%B1%B1%E8%84%88

Every URL starting with ``http://<lang>`` is extracted, its ``&...`` query
tail is removed, the long ``w/index.php?title=`` form is rewritten to the
short ``wiki/`` form, duplicates are dropped, and the sorted result is
written one URL per line to the chosen data file (default
``ifexists.<lang>``).

The whole log is read into memory in one go.
"""

import codecs
import re

try:
    _read_line = raw_input  # Python 2, the interpreter this script was written for
except NameError:
    _read_line = input  # Python 3

LOG_FILE_NAME = 'ifexists.log.1'
DEFAULT_LANG = 'sw'

# Removes everything from the first '&' on, e.g. a "&variant=zh-tw" tail.
QUERY_TAIL_RE = re.compile(r'&.*\b', flags=re.U)
# Matches the long URL form so it can be replaced by the short "wiki/" form.
LONG_URL_RE = re.compile(r'w/index\.php\?title=', flags=re.U)


def compile_url_pattern(lang):
    """Return a compiled regex matching URLs that start with ``http://<lang>``.

    *lang* is escaped so that it is always matched literally, even when the
    user types characters that are special in a regular expression.

    NOTE(review): the trailing word boundary makes a match stop at the last
    word character on the line, so non-word characters at the very end of a
    URL (for example a closing parenthesis) are dropped. Kept as in the
    original pattern.
    """
    return re.compile(r'http\://' + re.escape(lang) + r'.+\b', flags=re.U)


def normalize_url(url):
    """Return *url* without its ``&...`` tail and in short ``wiki/`` form."""
    url = QUERY_TAIL_RE.sub('', url)
    return LONG_URL_RE.sub('wiki/', url)


def extract_urls(text, lang):
    """Return the sorted, de-duplicated, normalized URLs of *lang* found in *text*.

    Normalizing before de-duplicating lets the long and the short form of the
    same page collapse into a single entry.
    """
    pattern = compile_url_pattern(lang)
    return sorted(set(normalize_url(url) for url in pattern.findall(text)))


def main():
    """Ask for language and output file, then extract the URLs and write them."""
    lang = DEFAULT_LANG
    print('ifexistsLogSorts.py - choose language')
    print('lang=' + lang)
    answer = _read_line('language?(sw)')
    if answer != '':
        lang = answer

    data_file_name = 'ifexists.' + lang
    answer = _read_line('dataFileName? ' + data_file_name)
    if answer != '':
        data_file_name = answer
    print(data_file_name)

    # Open read-only: append mode would create an empty log when the file is
    # missing and may start reading at end-of-file, yielding no data at all.
    with codecs.open(LOG_FILE_NAME, 'r', encoding='utf-8') as log_file:
        urls = extract_urls(log_file.read(), lang)

    # An empty answer selects step-by-step mode: the loop below then pauses
    # before each URL until something non-empty is typed.
    step = _read_line('haha\n\n\n\n\n\a')

    count = 0
    # The output file is opened only after the log was read successfully, so
    # a failed read no longer truncates an existing data file.
    with codecs.open(data_file_name, 'w', encoding='utf-8') as data_file:
        for url in urls:
            if step == '':
                step = _read_line(url + '\npress return to continue, something else to automate')
            count += 1
            data_file.write(url + '\n')

    print('%d urls in total.' % count)
    print('dataFile IS  ' + data_file_name)


if __name__ == '__main__':
    main()