Python:StatNew.py

From Botwiki

Jump to: navigation, search

A very basic routine: it loads a number of new pages from the Cantonese Wikipedia, counts the occurrences of a few key characters, and puts the data as a table on a wiki page, together with some standardised suggestions. The aim is to explore how much we can learn about an article from some simple statistics. Elegance is out of the question, for Python and I are new to each other.

#statNew.py
#SURVEY THE [[special:newpages|]] in yue.wikipedia
 
import re
import wikipedia
import pagegenerators
 
### settings
#import string
#numberofpages= string.atoi(raw_input('number of pages to check, ignoring years?'))
#datename='2007-11-01'
 
# Ask through the framework's wikipedia.input() instead of raw_input().
# raw_input() returns a byte string, and these answers are concatenated with
# unicode literals further down (page text and page titles); that raises
# UnicodeDecodeError as soon as an answer contains a non-ASCII character.
# wikipedia.input() returns the answer already decoded to unicode.
datename = wikipedia.input(u'name of sub-stat-page (date?) ')
shortcut = wikipedia.input(u'name of short-cut?')
x = wikipedia.input(u'number of pages to check? spacebar=10, (5)0, (1)50, (2)00, (3)00, (4)00, default=500?')

# One-key answers mapped to the number of new pages to survey; any other
# answer (including an empty one) falls back to 500.
_page_count_choices = {
    u' ': 10,
    u'5': 50,
    u'1': 150,
    u'2': 200,
    u'3': 300,
    u'4': 400,
}
numberofpages = _page_count_choices.get(x, 500)

# Both flags are handed straight to site.newpages() when the list is fetched.
repeat = False        # value for the ``repeat`` argument
get_redirect = False  # value for the ``get_redirect`` argument
 
##PAGE LAYOUT, WIKITABLE
# Wikitext fragments that frame the statistics table.  The fragment names
# (line1 ... line8) are kept as they are; there is no line5.

# Page heading: link to the shortcut, credit line and signature (~~~~).
line1 = u''.join([
    u';捷徑[[',
    shortcut,
    u']]\n新文統計 ([[user:R. Hillgentleman/statNew.py]]):~~~~\n',
])
# Opening of a sortable wikitable.
line2 = u'{|class=\"wikitable sortable\"'
# Header row: page, size, '{{' count, '[[' count, then one column for each
# good/bad character pair that is counted.
line3 = u'\n!文||字|| <nowiki>{{</nowiki> || <nowiki>[[</nowiki> ||嘅/的||係/是||唔/不||咗/了||'
line4 = u'\n'

# Table terminator, notes section, and the list of "bad word" patterns.
line6 = u'\n|}'
line7 = u'\n==註==\n<references/>'
# NOTE(review): only the patterns for 不, 的 and 是 are listed here; the one
# compiled for 了 is not -- confirm whether that omission is intentional.
line8 = u'\n*不(?!屈|惜|朽|治|測)\n*(?<!目|中|麗)的(?!確|而)\n*(?<!為|國|於)是(?!但|非|為|故)'

startofpage = u''.join((line1, line2, line3, line4))
endofpage = u''.join((line6, line7, line8))

# Output accumulator: starts as the page head, one table row is appended per
# surveyed page.
text = startofpage

# Shape of the wikitable markup being generated, for reference:
#
#   {|class="wikitable sortable"
#   |-
#   |d||d||e||g
#   |-
#   |e||tr||t||e
#   |-
#   |23||34||f||5
#   |}
 
############## REGEX ########
# Openers of template ({{) and link ([[) markup; used to count both per page.
braces = re.compile(r'\{\{')
brackets = re.compile(r'\[\[')

# The character patterns below contain no backslashes, so a plain u'' literal
# yields exactly the same pattern as the former ur'' literal.  Unlike ur'',
# u'' is accepted by both Python 2 and Python 3 (ur'' is a syntax error on 3).

# "Good" characters: counted wherever they occur.
ge = re.compile(u'嘅', flags=re.U)
hai = re.compile(u'係', flags=re.U)
nm = re.compile(u'唔', flags=re.U)
zo = re.compile(u'咗', flags=re.U)

# "Bad" characters: the lookbehind/lookahead groups skip occurrences that are
# directly preceded or followed by one of the listed characters.
dig = re.compile(u'(?<!目|中|麗)的(?!確|而)', flags=re.U)
si = re.compile(u'(?<!為|國|於)是(?!但|非|為|故)', flags=re.U)
bud = re.compile(u'不(?!屈|惜|朽|治|測)', flags=re.U)
liu = re.compile(u'(?<!不)了(?!解|結)', flags=re.U)

# (good, bad) pattern pairs, in the same order as the table columns
# 嘅/的, 係/是, 唔/不, 咗/了.
craplist = [(ge, dig), (hai, si), (nm, bud), (zo, liu)]
 
##################################### OPEN THE SITE ###########
# Site object used for every page read and write in this script.
site = wikipedia.getSite()

############# GET LIST OF NEW PAGE #################
# From wikipedia.Site.newpages(number=10, get_redirect=False, repeat=False).
# Each entry is a tuple of:
#   Page object, timestamp (unicode), length (int), an empty unicode string,
#   username or IP address (str), comment (unicode).
#
# NOTE(review): the name ``list`` shadows the builtin; it is kept unchanged
# because the analysis loop iterates over this exact name.
list = site.newpages(
    numberofpages,
    get_redirect,
    repeat,
)
 
 
## ANALYSE EACH PAGE
# Remark appended to a row, keyed by how many of the four character pairs
# have a good/bad ratio that floors to zero (i.e. fewer good than bad).
# Counts of 0 or 1 get no remark.
_verdicts = {
    2: u'唔似廣東話',
    3: u'應該唔係廣東話',
    4: u'唔係廣東話',
}

# Build one table row per surveyed page and append it to ``text``.
for page, timestamp, length, empty, user, comment in list:
    if page.isRedirectPage():
        continue
    t = page.title()
    if u'年' in t:  # skip pages whose title contains the character '年'
        continue

    # A page can disappear (or turn into a redirect) between listing and
    # fetching; skip it instead of letting one exception abort the whole run
    # before anything has been saved.
    try:
        y = page.get()
    except (wikipedia.NoPage, wikipedia.IsRedirectPage):
        continue

    # The length reported by the new-pages list is not used; measure the
    # fetched text instead.
    length = len(y)

    # Number of '{{' and '[[' openers in the page text.
    mo = len(braces.findall(y))
    link = len(brackets.findall(y))

    cells = [
        u'\n|-\n|[[' + t + u']]||%d' % length,
        u'%d' % mo,
        u'%d' % link,
    ]

    count = 0  # number of pairs whose ratio floors to zero
    for goodword, badword in craplist:
        m = len(goodword.findall(y))  # occurrences of the good character
        n = len(badword.findall(y))   # occurrences of the bad character
        # Explicit floor division: same result as Python 2's int ``/`` and
        # still an integer on Python 3.  -1 marks "no bad characters at all".
        q = m // n if n != 0 else -1
        if q == 0:
            count += 1
        cells.append(u'%d/%d.=.%d' % (m, n, q))

    # Every cell, including the last ratio cell, is followed by '||'.
    line = u'||'.join(cells) + u'||'
    line += _verdicts.get(count, u'')
    if link == 0 and mo == 0:  # no link and no template: needs wikifying
        line += u'<br/>要維基化'

    text += line
    # wikipedia.output() encodes for the console; a bare print of a unicode
    # row can raise UnicodeEncodeError on terminals that cannot show it.
    wikipedia.output(line)
#END OF PAGE
 
# Close the table and categorise the statistics page.
text += endofpage + u'[[Category:維基百科統計]]'

# Title of the statistics sub-page; built once so the page that is written
# and the redirect target cannot drift apart.  A plain u'' literal is used
# because ur'' is a syntax error on Python 3 and this text has no backslashes.
stat_title = u'Wikipedia:統計/' + datename

try:
    ## WRITE THE STATISTICS PAGE
    sand = wikipedia.Page(site, stat_title)
    sand.put(text, u'新文統計: [[user:R. Hillgentleman/statNew.py]]')

    ## CREATE THE SHORTCUT (name entered at the start of the run)
    short = wikipedia.Page(site, shortcut)
    short.put(u'#REDIRECT [[' + stat_title + u']]')
finally:
    # Always run the framework's shutdown call, even when a save fails.
    wikipedia.stopme()
#########################################
# SOME COMMENTED OUT CRAP
#
#ge = re.compile(ur'嘅') # or ur'\u5605'
#br = re.compile(r'\{\{')
#bl = re.compile(r'\}\}')
#newstr , n = ge.subn('',text) # replace every ur'嘅' by empty string
#newstr1 , n1= br.subn('',newstr)
#newstr2 , n2= bl.subn('',newstr1)
#wikipedia.output( 'the number of of GE in sandbox is: ')
#print n
#print ('numbers of {{,}}in sandbox are:')
#print n1 , n2
#wikipedia.stopme() 
############################################
Personal tools