Python:Stampast.py
From Botwiki
This script is used to process copyright violations on the Italian Wikipedia: it takes the copied text and finds the first revision in the page history that contains it.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Lusum
# Minor fixes: Filnik
# License: GPL2
# Description: Program to parse the copyviol advises.
"""
The results are logged also in a file called stresult.txt

You can run the bot with the following commandline parameters:

-page        - Only check a specific page of revertBot.
               Argument can also be given as "-page:pagetitle".
-subpage     - Examine only a specific section of revertBot
-notext      - Do not perform text analysis
-noverbose   - Less messages printed
-frompage    - Examine from a specific section of revertBot
-topage      - Examine until a specific section of revertBot
-auto        - Auto perform text analysis
-all         - Exam all revertBot pages
-text        - Look for a particular text
-simil       - Report the most similar revision when the text is not
               found (already the default)
-cococo      - Look for discussion page of cococo project
-analysis    - Analyze only urls and otrs
-onlyurl     - Analyze only urls
-forceupdate - Force update of copyviol black and whitelist

Examples:

    python stampast.py -subpage -auto -all
    #exam a particular subpage on all revertBot pages

    python stampast.py -auto -notext -noverbose
    # exam the urls and the credit without examining the page

    python stampast.py -subpage
    #exam a particular subpage
"""

import wikipedia
import catlib
import difflib
import urllib2
import re
import sys
import os
import codecs
import time
import query
from copyright import remove_wikicode, reports_cat, URLExclusion, read_file
from copyright_put import get_stats
#from wikipedia import fullVersionHistory
import simplejson
import config

remove_wikicode_dotall = True
# When True, search_text() also tracks the most similar revision.
simil = True

wikipedia.setLogfileStatus(False)

# Sub-directory (resolved through wikipedia.config.datafilepath) that holds
# the locally cached list files.
appdir = "copyright"

# URL patterns that mark a link as a probable law text.
_LAW_URL_PATTERNS = [
    re.compile(r'.*dlgs\d+_\d+\.html', re.S),
    re.compile(r'.*leggecomunitaria\d+.htm(l)?', re.S),
    re.compile(r'www\.camera\.it/parlam/leggi', re.S),
    re.compile(r'www\.giustizia\.it/cassazione/leggi', re.S),
    re.compile(r'www\.parlamento\.it/parlam/leggi', re.S),
]


def st_read_file(filename):
    """Return what copyright.read_file() yields for *filename* in appdir."""
    path = wikipedia.config.datafilepath(appdir, filename)
    wikipedia.config.makepath(path)
    return read_file(path)


def load_pages(page, filename, force_update=False):
    """Refresh the local copy of *page* stored as *filename* in appdir.

    The page is downloaded when *force_update* is true, when the local file
    does not exist, or when it is older than 24 hours.  A redirect is
    followed once.  If the download fails a message is printed and the
    existing file (if any) is left untouched.
    """
    path = wikipedia.config.datafilepath(appdir, filename)
    wikipedia.config.makepath(path)

    force_load = force_update
    if not os.path.exists(path):
        print('Creating file \'%s\' (%s)' % (
            wikipedia.config.shortpath(path), page.aslink()))
        force_load = True
    else:
        file_age = time.time() - os.path.getmtime(path)
        if file_age > 24 * 60 * 60:
            print('Updating file \'%s\' (%s)' % (
                wikipedia.config.shortpath(path), page.aslink()))
            force_load = True

    if not force_load:
        return

    data = None
    try:
        data = page.get()
    except KeyboardInterrupt:
        raise
    except wikipedia.IsRedirectPage:
        data = page.getRedirectTarget().get()
    except Exception:
        # The original called an undefined error() here, which turned every
        # download failure into a NameError.
        output('Getting page failed')

    if data:
        f = codecs.open(path, 'w', 'utf-8')
        try:
            f.write(data)
        finally:
            f.close()


def text_similarity(text1, text2):
    """Return the difflib similarity ratio (0..1) of the two texts."""
    return difflib.SequenceMatcher(None, text1, text2).ratio()


def output(text):
    """Print *text* through wikipedia.output()."""
    wikipedia.output(text)


def examine_user(author):
    """Print a warning if any entry of the global user_list occurs in *author*."""
    for user in user_list:
        if user and user in author:
            output("\03{lightred}user suspected [[Utente:%s]] found \n"
                   % (author))


def search_text(page, listtext, lang, examuser=True):
    """Look for each text of *listtext* in the revision history of *page*.

    For every text the revisions are scanned in the order returned by
    page.fullVersionHistory(); the first revision whose wikicode-stripped
    content contains the text is reported (date, author, link).  When no
    revision matches and the global ``simil`` is set, the most similar
    revision is reported instead.

    Each history entry is used as (oldid, date, author, utf-8 text).
    *lang* is the language code used to build the printed link.
    When *examuser* is true the author of the matching revision is checked
    against the suspected users list.
    """
    # Per the original author's note: the old history API is used because
    # the new one limits the history to 50 revisions, the old one returns
    # at least 500.
    listh = page.fullVersionHistory()
    urltitle = page.title().replace(" ", "_").replace("\"", "%22")
    link = "Link http://%s.wikipedia.org/w/index.php?title=%s&oldid=%s \n"

    for text in listtext:
        # Reset per text so that one text's result cannot leak into the next.
        found = False
        similmax = 0
        similversion = None

        output("\n==== Testo ====\n")
        output("%s\n" % text)
        for rev in listh:
            old = remove_wikicode(rev[3].decode('utf-8'))
            if simil:
                similvalue = text_similarity(old, text)
                if similvalue > similmax:
                    similmax = similvalue
                    similversion = rev
            if text in old:
                output("Prima versione trovata\n")
                output("data) %s \n" % rev[1])
                output("autore) [[Utente:%s]]\n" % rev[2])
                output(link % (lang, urltitle, rev[0]))
                if examuser:
                    # rev[2] is the author; the original passed the date.
                    examine_user(rev[2])
                found = True
                break

        if not found:
            output("testo non trovato\n")
            # similversion stays None when the history is empty or no
            # revision has a similarity above zero.
            if simil and similversion is not None:
                output("==== Similarity ====\n")
                output("ratio %s \n" % similmax)
                output("data) %s \n" % similversion[1])
                output("autore) [[Utente:%s]]\n" % similversion[2])
                output(link % (lang, urltitle, similversion[0]))


def clean_engines(text):
    """Strip everything up to and including the last '-' of each line."""
    if not text:
        return ""
    return re.sub('(.*)-', '', text)


def check_list(listurl, clist, reason, printentry=False):
    """Return [reason, url] pairs for every url that contains an entry of clist.

    Empty entries are ignored.  With *printentry* the matching entry is
    appended to the reported url text.
    """
    mlist = []
    for entry in clist:
        if not entry:
            continue
        for url in listurl:
            if entry in url:
                if printentry:
                    url += " , entry " + entry
                mlist.append([reason, url])
    return mlist


def check_exclusion_list(listurl, reason):
    """Return [reason, url] pairs for the urls accepted by URLexcl.check()."""
    mlist = []
    for url in listurl:
        if URLexcl.check(url):
            mlist.append([reason, url])
    return mlist


def check_list_bool(listurl, clist):
    """Return True if any non-empty entry of *clist* occurs in any url."""
    for entry in clist:
        if entry:
            for url in listurl:
                if entry in url:
                    return True
    return False


def get_urls(pagetxt):
    """Return the lines of *pagetxt* without comments, blank lines and <pre> tags."""
    pagetxt = re.sub(" ?#.*", "", pagetxt)
    pagetxt = re.sub("(?m)^\r?\n", "", pagetxt)
    return re.sub("</?pre>", "", pagetxt).splitlines()


def get_user_list(pagetitle):
    """Return the user names found in {{User|...}} templates of a page.

    Despite its name, *pagetitle* is a page object: its get() method is
    called to obtain the text.
    """
    pagetxt = pagetitle.get()
    regexuser = re.compile(r"\{\{[U|u]ser\|(.*?)\}\}", re.S)
    return regexuser.findall(pagetxt)


def get_cococo_list(pagetxt):
    """Return the user names found in {{cococo|utente=...|...}} templates."""
    regexuser = re.compile(r"\{\{cococo\|utente\=(.*?)\|.*?\}\}", re.S)
    return regexuser.findall(pagetxt)


def print_url_results(mlist):
    """Print every non-empty [reason, url] pair of *mlist*."""
    for entry in mlist:
        if entry:
            output('%s, %s' % (entry[0], entry[1]))


def examine_url_regex(listurl):
    """Return [label, url] pairs for the urls that look like law texts."""
    mlist = []
    for url in listurl:
        for pattern in _LAW_URL_PATTERNS:
            if pattern.search(url):
                mlist.append(["\03{lightgreen}Probabile legge", url])
                break
    return mlist


def examine_site_url(listurl, blackl, pagetitle):
    """Classify the urls of a report and return [label, url] pairs.

    Blacklisted urls (*blackl*) are removed from *listurl* IN PLACE (the
    caller relies on this) and always reported.  The remaining urls are
    checked against the law patterns, the words of the page title, the
    protected and suspected site lists and the exclusion list.
    """
    mlist = []
    for urtmp in blackl:
        if urtmp in listurl:
            listurl.remove(urtmp)
        mlist.append(["\03{lightblue}<blacklist>", urtmp])

    # Check whether the urls reference the page title: lowercase words,
    # dropping words of one to three letters to avoid false positives.
    # (Built as a new list: the original removed items from the list it was
    # iterating, which skipped the word following every removed one.)
    pagetitle_list = [word.lower() for word in pagetitle.split(' ')
                      if len(word) >= 4]

    mlist += examine_url_regex(listurl)
    mlist += check_list(listurl, pagetitle_list,
                        "Nome voce presente nell'url")
    # Check whether the urls belong to the protected sites list.
    mlist += check_list(listurl, protected_list, "\03{lightred}Sito protetto")
    # Check whether the urls belong to the suspected sites list and show
    # which entry matched.
    mlist += check_list(listurl, suspected_list,
                        "\03{lightaqua}Sito sospetto", True)
    mlist += check_exclusion_list(listurl, "\03{lightgreen}exclusion list")
    return mlist


def get_log_delete(title):
    """Return the deletion log events of *title* as given by the API query."""
    params = {
        'action': 'query',
        'list': 'logevents',
        'letype': 'delete',
        'letitle': title,
        'leprop': 'timestamp|comment|type',
    }
    results = query.GetData(params, useAPI=True, encodeTitle=False)
    return results['query']['logevents']


def analisi_template(pagina, pagetxt, copyurl):
    """Inspect deletion log, page templates and talk page of *pagina*.

    Findings are printed; "certain" results are appended to the global
    listRisulSicuri as [title, reason] pairs and urls found in deletion
    comments that are not in the protected list are added to the global
    addlist.
    """
    title = pagina.title()

    # The url stops at the first space, pipe or newline.  The original
    # pattern used a greedy '.*' under DOTALL and therefore captured the
    # whole rest of the comment instead of the url.
    regcomm = re.compile(r"(http://[^ |\n]+)")
    for res in get_log_delete(title):
        comment = res['comment']
        output("Voce cancellata data %s, commento %s"
               % (res['timestamp'], comment))
        delurl = regcomm.findall(comment)
        if not check_list_bool(delurl, protected_list):
            addlist.extend(delurl)
        tlist = check_list(copyurl, delurl,
                           "\03{lightred}Cancellata stesso sito")
        if tlist:
            # A fresh pair per result: the original re-appended one shared,
            # growing list, so only its first reason was ever printed.
            listRisulSicuri.append([title, "Cancellata dallo stesso sito"])
        print_url_results(tlist)

    if not pagina:
        return

    regwiki = re.compile(r'\{\{([W|w]\|.*?)\}\}', re.S)
    for res in regwiki.findall(pagetxt):
        output("\03{lightred}Da wikificare")

    regcanc = re.compile(r"\{\{([C|c]ancellazione)\}\}")
    reslist = regcanc.findall(pagetxt)
    if reslist:
        listRisulSicuri.append([title, "In cancellazione"])
    for res in reslist:
        output("\03{lightred}In cancellazione")

    regprom = re.compile(r"\{\{([E|e]\|.*?)\}\}", re.S)
    for res in regprom.findall(pagetxt):
        output("\03{lightred}Probabile promozione")

    regtrad = re.compile(r'\{\{([T|t]\|.*?)\}\}', re.S)
    for res in regtrad.findall(pagetxt):
        output("\03{lightgreen}In traduzione")

    regcrd = re.compile(r"\{\{[C|c]rediti\|(.*?)\|(.*?)\}\}", re.S)
    credts = regcrd.findall(pagetxt)
    if credts:
        listRisulSicuri.append([title, "OTRS"])
    for crres in credts:
        output("\03{lightgreen}crediti trovati: sito %s, %s\n"
               % (crres[0], crres[1]))

    # The talk page is optional: any failure to fetch it just skips the
    # talk page checks (KeyboardInterrupt is no longer swallowed).
    try:
        pagtktxt = pagina.toggleTalkPage().get()
    except Exception:
        return

    regscorp = re.compile(
        r'\{\{[S|s]corporoUnione\|(.*)\|(.*?)\|(.*?)\|(.*?)\}\}', re.S)
    for res in regscorp.findall(pagtktxt):
        output("Scorporo/unione, da pagina %s" % res[1])

    regsalv = re.compile(r'\{\{([N|n]oncancellata)\|.*\}\}', re.S)
    reslist = regsalv.findall(pagtktxt)
    for res in reslist:
        output("Salvata in cancellazione")
    if reslist:
        listRisulSicuri.append([title, "Salvata in cancellazione"])

    credts = regcrd.findall(pagtktxt)
    if credts:
        listRisulSicuri.append([title, "OTRS"])
    for crres in credts:
        output("\03{lightgreen}crediti trovati: sito %s, %s\n"
               % (crres[0], crres[1]))

    regtrad = re.compile(
        r'\{\{[T|t]radotto da\|(.*?)\|(.*?)[\|?|\}].*?\}?\}', re.S)
    for crres in regtrad.findall(pagtktxt):
        output("\03{lightgreen}tradotto da: wiki %s, voce %s\n"
               % (crres[0], crres[1]))


def analisi_traduzione(text):
    """Return [wiki code, interwiki title] pairs for the {{T|lingua=...}} templates.

    For each translation template whose language is known, the first
    interwiki link of *text* towards that language is taken.  Unknown
    languages and languages without an interwiki link are skipped.
    """
    lingue = {
        "inglese": "en",
        "francese": "fr",
        "tedesca": "de",
        "spagnola": "es",
        "russa": "ru",
        "portoghese": "pt",
    }
    tradlist = []
    regextrad = re.compile(r"\{\{[t|T]\|.*?lingua\=(.*?)\|.*?\}\}", re.S)
    for numcode in regextrad.findall(text):
        # .get() instead of indexing: an unlisted language used to raise
        # KeyError and abort the whole page examination.
        wikicode = lingue.get(numcode.strip())
        if wikicode is None:
            continue
        regexinterwiki = re.compile(
            r"\[\[" + wikicode + r"\:" + r"(.*?)\]\]", re.S)
        interwikis = regexinterwiki.findall(text)
        if interwikis:
            tradlist.append([wikicode, interwikis[0]])
    return tradlist


def pagina_da_cancellare(title):
    """Record in listRisulSicuri why *title* was deleted, per its deletion log."""
    for res in get_log_delete(title):
        comment = res['comment']
        # Fresh [title, reason] pairs; see analisi_template().
        if 'semplificata' in comment:
            listRisulSicuri.append([title, "Cancellata in semplificata"])
        if 'C4' in comment:
            listRisulSicuri.append([title, "Cancellata promozionale"])


def exampage(ris):
    """Examine one report section.

    *ris* is a (title, header text, section body) tuple as produced by
    get_result_list().  Urls are always analysed; depending on the global
    options the page templates are inspected and the reported texts are
    searched in the history of the page (and of its translation sources).
    """
    regex3 = re.compile(r"(http://.*?)[ |\n]", re.S)
    listurl = list(set(regex3.findall(ris[2])))
    regexsp = re.compile(r"<blacklist>(.*?)[ |\n]", re.S)
    blackl = list(set(regexsp.findall(ris[2])))

    # examine_site_url() strips the blacklisted urls from listurl.
    urlist = examine_site_url(listurl, blackl, ris[0])
    copyurl = listurl + blackl
    if not urlist and onlyurl:
        return

    pagina_da_cancellare(ris[0])
    output("== [[%s]] ==\n" % ris[0])
    if verbose:
        output("%s" % ris[1])
        output("%s" % ris[2])
    if urlist:
        print_url_results(urlist)
    if onlyurl:
        return

    pagina = wikipedia.Page(wikipedia.getSite(), ris[0])
    pagtxt = pagina.get()
    analisi_template(pagina, pagtxt, copyurl)
    if notext:
        return

    output("=== Analisi pagina ===")
    tradlist = analisi_traduzione(pagtxt)
    if autost:
        choice = 'S'
    else:
        choice = wikipedia.inputChoice(
            'Posso procedere ( a per altra wiki o altra voce )?',
            ['Si', 'No', 'Altra'], ['s', 'N', 'a'], 'N')

    # Texts given with -text override the ones listed in the report.
    regex2 = re.compile(r"\*\*(.*?)\n", re.S)
    listtextcopyviol = ltext or regex2.findall(ris[2])

    if choice in ['S', 's']:
        output("=== %s - %s ===" % (wikipedia.getSite(), ris[0]))
        search_text(pagina, listtextcopyviol, 'it')
        for element in tradlist:
            site = wikipedia.getSite(element[0])
            output("=== %s - %s ===" % (site, element[1]))
            pagina = wikipedia.Page(site, element[1])
            # Pass the wiki code of the translated page so that the printed
            # link points to that wiki (the original always passed 'it').
            search_text(pagina, listtextcopyviol, element[0])
    elif choice in ['A', 'a']:
        wikisc = wikipedia.input('Quale wiki [[en]] ?')
        if not wikisc:
            wikisc = 'en'
        pagesc = wikipedia.input('Quale pagina? [[' + ris[0] + ']] ?')
        if not pagesc:
            pagesc = ris[0]
        pagina = wikipedia.Page(wikipedia.getSite(wikisc), pagesc)
        search_text(pagina, listtextcopyviol, wikisc)


def get_result_list(text):
    """Split a report page into (title, header text, body) sections.

    The global ``subp`` restricts the result to one section; the globals
    ``frompage`` / ``topage`` select a range (start included, end excluded)
    and take precedence over ``subp``.
    """
    # Sentinel so that the look-ahead also terminates the last section.
    text += '\n==='
    regex1 = re.compile(
        r'=== ?\[\[(.*?)\]\] ?(.*?) ?===\n?(.*?(?=\=\=\=))', re.S)
    tmpresult = regex1.findall(text)

    result = tmpresult
    if subp:
        result = [ris for ris in tmpresult if ris[0] == subp]
    if frompage:
        getris = False
        result = []
        for ris in tmpresult:
            if ris[0] == frompage:
                getris = True
            if topage and ris[0] == topage:
                getris = False
            if getris:
                result.append(ris)
    return result


def get_count_revertbot(artlist):
    """Print the number of reports on every page of *artlist*, then exit."""
    count = 0
    for pagina in artlist:
        # Defined before the try block: a missing page is reported with 0
        # instead of a stale (or unbound) value.
        pagecount = 0
        try:
            reverttext = pagina.get()
            pagecount = len(re.findall(r'=== \[\[', reverttext))
            count += pagecount
        except wikipedia.NoPage:
            output("%s doesn't exist!" % pagina.title())
        output("Page %s, signalations %d" % (pagina, pagecount))
    output("Total signalations %d" % count)
    sys.exit()


def get_revertbot_list():
    """Return the articles of the localized reports category."""
    cat = catlib.Category(
        wikipedia.getSite(),
        'Category:%s' % wikipedia.translate(wikipedia.getSite(), reports_cat))
    return cat.articlesList()


def get_pagina_revertbot(titolo):
    """Return the report page to examine.

    With a non-empty *titolo* that page is returned; otherwise the user
    picks one from a numbered menu.  Choosing a number past the end of the
    list prints the report counts and exits (see get_count_revertbot()).
    """
    if titolo != '':
        return wikipedia.Page(wikipedia.getSite(), titolo)

    artlist = get_revertbot_list()
    for num, art in enumerate(artlist):
        wikipedia.output("%d: %s" % (num + 1, art))
    wikipedia.output("%d: %s" % (len(artlist) + 1, "Conta segnalazioni"))

    # Ask again on non-numeric or non-positive input instead of crashing
    # (or silently picking the last page for 0).
    while True:
        scelta = wikipedia.input('Quale pagina vuoi esaminare?')
        try:
            scelta = int(scelta)
        except ValueError:
            continue
        if scelta >= 1:
            break

    if scelta > len(artlist):
        get_count_revertbot(artlist)
    return artlist[scelta - 1]


def exam_list(result):
    """Run exampage() on every section of *result*.

    Unless the global ``autost`` is set, the user is asked after each
    section whether to continue, repeat it or stop.
    """
    num = 0
    while num < len(result):
        try:
            exampage(result[num])
        except Exception:
            # Best effort: a failing section must not stop the run, but the
            # error is shown.  KeyboardInterrupt now propagates, consistent
            # with load_pages(); the former bare except made -auto runs
            # impossible to interrupt.
            output("Errore: %r\n" % (sys.exc_info()[1],))
            output("\n")
        if not autost:
            choice = wikipedia.inputChoice(
                u'Continuo, Ripeti, o esci',
                ['C', 'R', 'N'], ['C', 'r', 'n'], 'C')
            if choice not in ['C', 'c', 'R', 'r']:
                return
            if choice in ['R', 'r']:
                num -= 1
        num += 1


def esamina_pagina(pagina):
    """Examine all the selected sections of the report page *pagina*."""
    try:
        reverttext = pagina.get()
        wikipedia.output(u'\n\t>>>Inizio a controllare!<<<\n')
        exam_list(get_result_list(reverttext))
    except wikipedia.NoPage:
        output("%s doesn't exist!" % pagina.title())


def _cached_page_text(title, filename, force_update):
    """Refresh the local copy of an it.wikipedia page and return its content."""
    page = wikipedia.Page(wikipedia.getSite("it"), title)
    load_pages(page, filename, force_update)
    return st_read_file(filename)


def load_data(force_update=False):
    """Fill the global protected_list, suspected_list and user_list.

    The lists come from it.wikipedia pages cached locally (see
    load_pages()); the url exclusion list is downloaded and scanned too.
    """
    global protected_list
    global suspected_list
    global user_list

    protected_list = []
    suspected_list = []
    user_list = []

    output("Load Data")
    output("Siti Protetti")
    protected_list = get_urls(_cached_page_text(
        "Utente:Lusum/SitiProtetti", 'siti_protetti.txt', force_update))

    output("Siti Sospetti")
    suspected_list = get_urls(_cached_page_text(
        "Utente:Lusum/SitiSospetti", 'siti_sospetti.txt', force_update))

    output("Utenti Sospetti")
    user_list = get_cococo_list(_cached_page_text(
        "Progetto:Cococo/controlli/lista", 'utenti_sospetti.txt',
        force_update))
    user_list = [f for f in user_list if f is not None]
    if cococo:
        usersusp = wikipedia.Page(wikipedia.getSite("it"),
                                  "Discussioni_progetto:Cococo")
        user_list += get_user_list(usersusp)

    URLexcl.download(force_update)
    URLexcl.scan()


def _option_value(arg, option, prompt):
    """Return the value of ``-option:value``, asking the user if it is missing."""
    if len(arg) == len(option):
        return wikipedia.input(prompt)
    return arg[len(option) + 1:]


def main():
    """Parse the command line, load the lists and examine the report pages."""
    global simil
    global subp
    global frompage
    global topage
    global notext
    global verbose
    global autost
    global onlyurl
    global ltext
    global cococo
    global addlist
    global listRisulSicuri
    global URLexcl

    titolo = ''
    subp = ''
    frompage = ''
    topage = ''
    ltext = []
    allist = []
    addlist = []
    listRisulSicuri = []
    autost = False
    notext = False
    verbose = True
    onlyurl = False
    allpages = False
    force_update = False
    cococo = False
    URLexcl = URLExclusion()

    for arg in wikipedia.handleArgs():
        if arg == '-auto':
            autost = True
        elif arg == '-notext':
            notext = True
        elif arg == '-onlyurl':
            onlyurl = True
            verbose = False
            notext = True
            autost = True
        elif arg == '-noverbose':
            verbose = False
        elif arg == '-analysis':
            verbose = False
            notext = True
            autost = True
        elif arg.startswith('-page'):
            # A bare "-page" leaves the title empty, which triggers the
            # interactive menu of get_pagina_revertbot().
            titolo = arg[len('-page:'):]
        elif arg.startswith('-subpage'):
            subp = _option_value(
                arg, '-subpage', u'Choose a section to examine: ')
        elif arg.startswith('-frompage'):
            # The original sliced with len('-topage:') here and so kept a
            # stray "e:" in front of the value.
            frompage = _option_value(
                arg, '-frompage', u'Choose a page to start examination: ')
        elif arg.startswith('-topage'):
            topage = _option_value(
                arg, '-topage', u'Choose a page to stop examination: ')
        elif arg == '-simil':
            simil = True
        elif arg == '-all':
            allpages = True
        elif arg.startswith('-text'):
            ltext.append(_option_value(
                arg, '-text', u'Choose a text to examine: '))
        elif arg.startswith('-forceupdate'):
            force_update = True
        elif arg.startswith('-cococo'):
            cococo = True
        else:
            output('incorrect arg %s' % arg)

    load_data(force_update)

    if allpages:
        allist = get_revertbot_list()
    else:
        allist.append(get_pagina_revertbot(titolo))

    for pagina in allist:
        output("\n\t%s\n" % pagina)
        esamina_pagina(pagina)

    # Print the pages that are certainly to be excluded or deleted.
    if listRisulSicuri:
        output('\nRisultati')
        for entry in listRisulSicuri:
            if entry:
                output('%s, %s' % (entry[0], entry[1]))

    output("\nda aggiungere alla protected list\n")
    for url in addlist:
        output("%s" % url)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
BlogMarks
del.icio.us
digg
Fark
Furl
Newsvine
reddit
Segnalo
Simpy
Slashdot
smarking
Spurl
Wists
