Python:StatNew.py
From Botwiki
A very basic routine: it loads a number of new pages from the Cantonese Wikipedia, counts the occurrences of a few key characters, and puts the data as a table on a wiki page, together with some standardised suggestions. The aim is to develop the idea of seeing how much we can learn about an article from some simple statistics. Elegance is out of the question, for Python and I are new to each other.
# -*- coding: utf-8 -*-
# NOTE: the coding line above must sit on line 1 or 2 of the script file;
# without it Python 2 rejects the non-ASCII literals below (PEP 263).
"""statNew.py -- survey [[special:newpages]] on yue.wikipedia.

For every new page (redirects and titles containing the character for
"year" are skipped) the script counts double braces, double brackets and
four pairs of marker characters: a Cantonese one and its written-Chinese
counterpart.  The counts are written as one row of a sortable wikitable:

    {|class="wikitable sortable"
    !title||length||{{||[[||pair 1||pair 2||pair 3||pair 4||remarks
    |-
    |[[Title]]||123||4||5||m/n.=.q||...
    |}

The finished table is saved to ``Wikipedia:統計/<name>`` and a redirect to
it is created at the short-cut title the operator typed in.
"""

import re
import sys

# ---------------------------------------------------------------- patterns
# Written-Chinese markers carry look-arounds that exclude a few compounds.
# They are kept as named strings because three of them are also printed
# verbatim in the footnote legend of the output page.
DIG_PATTERN = u'(?<!目|中|麗)的(?!確|而)'
SI_PATTERN = u'(?<!為|國|於)是(?!但|非|為|故)'
BUD_PATTERN = u'不(?!屈|惜|朽|治|測)'
LIU_PATTERN = u'(?<!不)了(?!解|結)'

BRACES = re.compile(r'\{\{')
BRACKETS = re.compile(r'\[\[')

# (Cantonese marker, written-Chinese marker), in table-column order.
MARKER_PAIRS = [
    (re.compile(u'嘅', flags=re.U), re.compile(DIG_PATTERN, flags=re.U)),
    (re.compile(u'係', flags=re.U), re.compile(SI_PATTERN, flags=re.U)),
    (re.compile(u'唔', flags=re.U), re.compile(BUD_PATTERN, flags=re.U)),
    (re.compile(u'咗', flags=re.U), re.compile(LIU_PATTERN, flags=re.U)),
]

# Remark appended when this many pairs have fewer Cantonese markers than
# written-Chinese ones (roughly: "doesn't look like", "probably isn't",
# "isn't" Cantonese).
VERDICTS = {
    2: u'唔似廣東話',
    3: u'應該唔係廣東話',
    4: u'唔係廣東話',
}
NEEDS_WIKIFY = u'<br/>要維基化'   # page has neither a link nor a template

SKIP_TITLE_CHAR = u'年'           # titles containing it (year pages) are ignored

# ------------------------------------------------------------- page layout
# NOTE(review): the legend lists only three of the four look-around
# patterns (LIU_PATTERN is missing).  Kept that way so the generated page
# text is unchanged.
PAGE_FOOTER = (
    u'\n|}'
    u'\n==註==\n<references/>'
    u'\n*' + BUD_PATTERN +
    u'\n*' + DIG_PATTERN +
    u'\n*' + SI_PATTERN
)
PAGE_CATEGORY = u'[[Category:維基百科統計]]'
STAT_PAGE_PREFIX = u'Wikipedia:統計/'
EDIT_SUMMARY = u'新文統計: [[user:R. Hillgentleman/statNew.py]]'

# Operator's one-key answer -> number of new pages to fetch.
PAGE_COUNT_CHOICES = {
    ' ': 10,
    '5': 50,
    '1': 150,
    '2': 200,
    '3': 300,
    '4': 400,
}
DEFAULT_PAGE_COUNT = 500


def pages_to_check(answer):
    """Translate the operator's answer into a page count (default 500)."""
    return PAGE_COUNT_CHOICES.get(answer, DEFAULT_PAGE_COUNT)


def page_header(shortcut):
    """Return the text that precedes the table rows.

    *shortcut* is the title of the redirect page; it is linked in the first
    line of the report.
    """
    return (
        u';捷徑[[' + shortcut + u']]\n'
        u'新文統計 ([[user:R. Hillgentleman/statNew.py]]):~~~~\n'
        u'{|class="wikitable sortable"'
        u'\n!文||字|| <nowiki>{{</nowiki> || <nowiki>[[</nowiki> '
        u'||嘅/的||係/是||唔/不||咗/了||'
        u'\n'
    )


def _occurrences(pattern, text):
    """Count the non-overlapping matches of *pattern* in *text*."""
    return len(pattern.findall(text))


def table_row(title, content):
    """Build the wikitable row describing one page.

    The row holds the linked title, ``len(content)``, the number of ``{{``
    and ``[[``, and for each marker pair ``m/n.=.q`` where *m* counts the
    Cantonese marker, *n* the written-Chinese one and *q* is ``m // n``
    (``-1`` when *n* is zero).  ``q == 0`` therefore means the Cantonese
    marker is the rarer of the two; if that holds for two or more pairs a
    remark is appended.  A page with no link and no template gets the
    "needs wikifying" remark.
    """
    templates = _occurrences(BRACES, content)
    links = _occurrences(BRACKETS, content)

    cells = [u'\n|-\n|[[' + title + u']]||%d||' % len(content),
             u'%d||' % templates,
             u'%d||' % links]

    suspicious = 0
    for cantonese, written in MARKER_PAIRS:
        m = _occurrences(cantonese, content)
        n = _occurrences(written, content)
        # Floor division on purpose: only "fewer than" matters, and `//`
        # behaves the same on every Python version.
        q = m // n if n else -1
        if q == 0:
            suspicious += 1
        cells.append(u'%d/%d.=.%d||' % (m, n, q))

    cells.append(VERDICTS.get(suspicious, u''))
    if links == 0 and templates == 0:
        cells.append(NEEDS_WIKIFY)
    return u''.join(cells)


def _prompt(question):
    """Ask the operator a question and return the answer as unicode.

    ``raw_input`` yields a byte string on Python 2; mixing that with the
    unicode page text raises UnicodeDecodeError as soon as the answer holds
    a non-ASCII character, so it is decoded here, once.
    """
    answer = raw_input(question)
    if isinstance(answer, bytes):
        answer = answer.decode(sys.stdin.encoding or 'utf-8')
    return answer


def main():
    """Prompt for settings, survey the new pages and publish the report."""
    # Imported here rather than at module level so that the pure helpers
    # above can be imported without the pywikipedia framework installed.
    import wikipedia
    import pagegenerators  # noqa: F401 -- imported by the original script

    try:
        datename = _prompt('name of sub-stat-page (date?) ')
        shortcut = _prompt('name of short-cut?')
        wanted = pages_to_check(_prompt(
            'number of pages to check? spacebar=10, (5)0, (1)50, (2)00, '
            '(3)00, (4)00, default=500?'))

        site = wikipedia.getSite()
        parts = [page_header(shortcut)]

        # Each entry is (Page, timestamp, length, u'', user, comment); only
        # the Page is used -- the listed length is unreliable, so the row is
        # built from the fetched text instead.
        new_pages = site.newpages(number=wanted, get_redirect=False,
                                  repeat=False)
        for entry in new_pages:
            page = entry[0]
            if page.isRedirectPage():
                continue
            title = page.title()
            if SKIP_TITLE_CHAR in title:
                continue
            try:
                content = page.get()
            except (wikipedia.NoPage, wikipedia.IsRedirectPage):
                # Deleted or turned into a redirect since it was listed;
                # skip it instead of losing the whole report.
                continue
            row = table_row(title, content)
            parts.append(row)
            # The framework's console writer, used instead of print so a
            # console that cannot encode the row does not abort the run.
            wikipedia.output(row)

        parts.append(PAGE_FOOTER + PAGE_CATEGORY)

        target = STAT_PAGE_PREFIX + datename
        wikipedia.Page(site, target).put(u''.join(parts), EDIT_SUMMARY)
        # Create the short-cut redirect pointing at the report.
        wikipedia.Page(site, shortcut).put(u'#REDIRECT [[' + target + u']]')
    finally:
        # Always release the framework's throttle, even after an error.
        wikipedia.stopme()


if __name__ == '__main__':
    main()
BlogMarks
del.icio.us
digg
Fark
Furl
Newsvine
reddit
Segnalo
Simpy
Slashdot
smarking
Spurl
Wists
