Planning the future of Botwiki! - Help us bring Botwiki up to date, contribute to our strategy discussion, add bot scripts, and contribute manuals, guides, and tutorials! Almost anything related to bots, particularly those used to edit mediawiki, is welcome.
UNABLE TO EDIT? - We've experienced attacks by spambots lately and now require you to confirm your e-mail before you can edit (go to your preferences, enter an e-mail address, and request a confirmation e-mail, then go to your e-mail and click on the confirmation link). We also require new accounts to make a few edits and wait a few minutes before you can create a page; however, if this is a problem, contact us in #botwiki and we can manually confirm your account. Sorry for the inconvenience.
Python:Stampast.py
This script is used to process copyright violations on the Italian Wikipedia: it takes the copied text and finds the first version containing it in the page's history.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Author: Lusum
# Minor fixes: Filnik
# License: GPL2
# Description: Program to parse the copyviol advises.
"""
The results are logged also in a file called stresult.txt

You can run the bot with the following commandline parameters:

-page        - Only check a specific page of revertBot.
               Argument can also be given as "-page:pagetitle".
-subpage     - Examine only a specific section of revertBot
-vers        - Perform versions test analysis
-noverbose   - Less messages printed
-frompage    - Examine from a specific section of revertBot
-topage      - Examine until a specific section of revertBot
-auto        - Auto perform text analysis
-all         - Exam all revertBot pages
-text        - Look for a particular text
-cococo      - Look for discussion page of cococo project
-analysis    - Analyze only urls and otrs
-onlyurl     - Analyze only urls
-forceupdate - Force update of copyviol black and whitelist

Examples:

    python stampast.py -subpage -auto -all
        #exam a particular subpage on all revertBot pages
    python stampast.py -auto -noverbose
        # exam the urls and the credit without examining the page
    python stampast.py -subpage
        #exam a particular subpage
"""

import wikipedia
import catlib
import difflib
import urllib2
import re
import sys
import os
import codecs
import time
import query
from copyright import remove_wikicode, reports_cat, URLExclusion, read_file
from copyright_put import get_stats
#from wikipedia import fullVersionHistory
import simplejson
import config

# NOTE(review): not read anywhere in this file; presumably consumed by the
# copyright module -- confirm before removing.
remove_wikicode_dotall = True
# When true, search_text() also reports the most similar revision.
simil = True
wikipedia.setLogfileStatus(False)
# Data sub-directory in which the cached wiki lists are stored.
appdir = "copyright"

# Run-time state. main() and load_data() overwrite these; the defaults only
# make sure the helpers below do not fail with NameError when this module is
# imported and used without going through main().
protected_list = []
suspected_list = []
user_list = []
addlist = []
listRisulSicuri = []
ltext = []
subp = ''
frompage = ''
topage = ''
autost = False
notext = True
verbose = True
onlyurl = False
cococo = False
URLexcl = None

# Cached lists older than this many seconds are downloaded again.
_MAX_FILE_AGE = 24 * 60 * 60

# Regular expressions used by the parsers below.
_URL_RE = re.compile(r"(http://.*?)[ |\n]", re.S)
_BLACKLIST_RE = re.compile(r"<blacklist>(.*?)[ |\n]", re.S)
_COPYVIOL_TEXT_RE = re.compile(r"\*\*(.*?)\n", re.S)
_SECTION_RE = re.compile(
    r'=== ?\[\[(.*?)\]\] ?(.*?) ?===\n?(.*?(?=[=]{3}))', re.S)
_USER_TEMPLATE_RE = re.compile(r"\{\{[Uu]ser\|(.*?)\}\}", re.S)
_COCOCO_TEMPLATE_RE = re.compile(
    r"\{\{cococo\|utente=(.*?)\|.*?\}\}", re.S)
_TRANSLATION_TEMPLATE_RE = re.compile(
    r"\{\{[tT]\|.*?lingua=(.*?)\|.*?\}\}", re.S)
# A URL inside a deletion comment ends at the first blank, pipe or newline.
_DELETED_URL_RE = re.compile(r"http://[^ |\n]+")
# URL shapes that are treated as "probably a law text".
_LAW_URL_RES = (
    re.compile(r'.*dlgs\d+_\d+\.html', re.S),
    re.compile(r'.*leggecomunitaria\d+.htm(l)?', re.S),
    re.compile(r'www\.camera\.it/parlam/leggi', re.S),
    re.compile(r'www\.giustizia\.it/cassazione/leggi', re.S),
    re.compile(r'www\.parlamento\.it/parlam/leggi', re.S),
)


def st_read_file(filename):
    """Return the content of the cached data file `filename`."""
    path = wikipedia.config.datafilepath(appdir, filename)
    wikipedia.config.makepath(path)
    return read_file(path)


def split_params(param):
    """Split a "name=value" template parameter into (name, value).

    Only the first "=" separates name and value, so values that contain
    "=" themselves are kept whole. Raises IndexError when `param` has
    no "=".
    """
    pieces = param.rstrip('\n').split('=', 1)
    return pieces[0].lstrip(' '), pieces[1]


def getconten(param):
    """Return the value part of "name=value", or `param` if it has no "="."""
    param = param.rstrip('\n')
    try:
        return param.split('=', 1)[1]
    except IndexError:
        return param


def add_list_sure_results(title, motivo):
    """Record `title` with reason `motivo` in the final result list."""
    listRisulSicuri.append([title, motivo])


def load_pages(page, filename, force_load=False):
    """Refresh the cached copy of `page` stored in data file `filename`.

    The page is downloaded when the file does not exist, when it is older
    than one day, or when `force_load` is true. A download failure is
    reported and leaves the existing file untouched.
    """
    path = wikipedia.config.datafilepath(appdir, filename)
    wikipedia.config.makepath(path)
    if not os.path.exists(path):
        output('Creating file \'%s\' (%s)' % (
            wikipedia.config.shortpath(path), page.title(asLink=True)))
        force_load = True
    elif time.time() - os.path.getmtime(path) > _MAX_FILE_AGE:
        output('Updating file \'%s\' (%s)' % (
            wikipedia.config.shortpath(path), page.title(asLink=True)))
        force_load = True
    if not force_load:
        return

    data = None
    try:
        data = page.get()
    except wikipedia.IsRedirectPage:
        data = page.getRedirectTarget().get()
    except Exception:
        # Best effort: keep whatever is cached. KeyboardInterrupt is not an
        # Exception subclass and therefore still propagates.
        output('Getting page failed')
    if data:
        f = codecs.open(path, 'w', 'utf-8')
        try:
            f.write(data)
        finally:
            f.close()


def text_similarity(text1, text2):
    """Return the difflib similarity ratio of the two texts."""
    return difflib.SequenceMatcher(None, text1, text2).ratio()


def output(text):
    """Print `text` through the framework's output function."""
    wikipedia.output(text)


def examine_user(author):
    """Report `author` if a name from the suspected-user list occurs in it."""
    for user in user_list:
        if user and user in author:
            output("\03{lightred}user suspected [[Utente:%s]] found \n"
                   % (author))


def _output_version(page, lang, vers):
    """Print date, author and permanent link of one revision tuple.

    As used in this file, `vers` is indexed as (revision id, date, user,
    text).
    """
    output("data) %s \n" % vers[1])
    output("autore) [[Utente:%s]]\n" % vers[2])
    title = page.title().replace(" ", "_").replace("\"", "%22")
    output("Link http://%s.wikipedia.org/w/index.php?title=%s&oldid=%s \n"
           % (lang, title, vers[0]))


def search_text(page, listtext, lang, examuser=True):
    """Look for every text of `listtext` in the revision history of `page`.

    For each text the first revision (in the order returned by
    page.fullVersionHistory()) whose wikicode-stripped content contains it
    is printed. If none does and `simil` is set, the most similar revision
    is printed instead. `lang` is the wiki language code used to build the
    links. With `examuser` the author of a matching revision is checked
    against the suspected-user list.
    """
    fullVersHist = page.fullVersionHistory()
    # Stripping wikicode is expensive: do it at most once per revision even
    # when several texts are searched, and only for revisions really visited.
    cleaned = {}
    for text in listtext:
        output("\n==== Testo ====\n")
        output("%s\n" % text)
        found = False
        # The similarity search starts over for every text.
        similmax = 0
        similversion = None
        for index, vers in enumerate(fullVersHist):
            if index not in cleaned:
                cleaned[index] = remove_wikicode(vers[3])
            old = cleaned[index]
            if simil:
                similvalue = text_similarity(old, text)
                if similvalue > similmax:
                    similmax = similvalue
                    similversion = vers
            if text in old:
                output("Prima versione trovata\n")
                _output_version(page, lang, vers)
                if examuser:
                    # vers[2] is the author (vers[1] is the date).
                    examine_user(vers[2])
                found = True
                break
        if found:
            continue
        output("testo non trovato\n")
        if simil:
            output("==== Similarity ====\n")
            output("ratio %s \n" % similmax)
            # No revision at all, or none with a ratio above zero.
            if similversion is not None:
                _output_version(page, lang, similversion)


def clean_engines(text):
    """Remove everything up to and including the last "-" of each line."""
    if not text:
        return ""
    return re.sub('(.*)-', '', text)


def check_list(listurl, clist, reason, printentry=False):
    """Return [reason, url] pairs for every url containing an entry of clist.

    With `printentry` the matching entry is appended to the reported url.
    Empty entries are ignored.
    """
    mlist = []
    for entry in clist:
        if not entry:
            continue
        for url in listurl:
            if entry in url:
                if printentry:
                    url += " , entry " + entry
                mlist.append([reason, url])
    return mlist


def check_exclusion_list(listurl, reason):
    """Return [reason, url] pairs for the urls matched by the exclusion list."""
    return [[reason, url] for url in listurl if URLexcl.check(url)]


def check_list_bool(listurl, clist):
    """Return True if any non-empty entry of clist occurs in any url."""
    for entry in clist:
        if not entry:
            continue
        for url in listurl:
            if entry in url:
                return True
    return False


def get_urls(pagetxt):
    """Turn the text of a site-list page into a list of lines.

    "#" comments, empty lines and <pre> tags are removed first.
    """
    pagetxt = re.sub(r" ?#.*", "", pagetxt)
    pagetxt = re.sub(r"(?m)^\r?\n", "", pagetxt)
    return re.sub(r"</?pre>", "", pagetxt).splitlines()


def get_user_list(pagetitle):
    """Return the names found in {{user|...}} templates of page `pagetitle`.

    Despite its name, `pagetitle` is a page object: its text is read with
    pagetitle.get().
    """
    return _USER_TEMPLATE_RE.findall(pagetitle.get())


def get_cococo_list(pagetxt):
    """Return the user names found in {{cococo|utente=...|...}} templates."""
    return _COCOCO_TEMPLATE_RE.findall(pagetxt)


def print_url_results(mlist):
    """Print every non-empty [reason, url] pair of `mlist`."""
    for entry in mlist:
        if entry:
            output('%s, %s' % (entry[0], entry[1]))


def examine_url_regex(listurl):
    """Return [reason, url] pairs for urls that look like law texts."""
    mlist = []
    for url in listurl:
        for reglaw in _LAW_URL_RES:
            if reglaw.search(url):
                mlist.append(["\03{lightgreen}Probabile legge", url])
                break
    return mlist


def examine_site_url(listurl, blackl, pagetitle):
    """Classify the urls of one report and return [reason, url] pairs.

    Blacklisted urls (`blackl`) are reported and removed from `listurl`
    in place. The remaining urls are checked against the law patterns,
    the words of `pagetitle`, the protected and suspected site lists and
    the exclusion list.
    """
    mlist = []
    for urtmp in blackl:
        try:
            listurl.remove(urtmp)
        except ValueError:
            # The blacklisted url was not among the plain urls.
            pass
        mlist.append(["\03{lightblue}<blacklist>", urtmp])

    # Check whether the urls mention the page title: lower-case the title
    # words and drop words of one to three letters to avoid false positives.
    pagetitle_list = [word.lower() for word in pagetitle.split(' ')
                      if len(word) >= 4]

    mlist += examine_url_regex(listurl)
    mlist += check_list(listurl, pagetitle_list,
                        "Nome voce presente nell'url", True)
    # Check whether the urls belong to the protected-site list.
    mlist += check_list(listurl, protected_list,
                        "\03{lightred}Sito protetto")
    # Check whether the urls belong to the suspected-site list and show
    # which entry matched.
    mlist += check_list(listurl, suspected_list,
                        "\03{lightaqua}Sito sospetto", True)
    mlist += check_exclusion_list(listurl, "\03{lightgreen}exclusion list")
    return mlist


def get_log_delete(title):
    """Return the deletion-log events of page `title` from the API."""
    params = {
        'action': 'query',
        'list': 'logevents',
        'letype': 'delete',
        'letitle': title,
        'leprop': 'timestamp|comment|type',
    }
    results = query.GetData(params, useAPI=True, encodeTitle=False)
    return results['query']['logevents']


def _template_param(params, index):
    """Return params[index], or '' when the template has fewer parameters."""
    if index < len(params):
        return params[index]
    return ''


def analisi_template(pagina, copyurl, pageevents):
    """Print what deletion log and templates tell about page `pagina`.

    `pageevents` are the deletion-log events of the page and `copyurl` the
    urls of the copyviol report. Urls found in deletion comments that are
    not on the protected list are collected in the global `addlist`.
    Findings that settle the case are stored with add_list_sure_results().
    """
    for res in pageevents:
        # A log entry may come without a comment.
        comment = res.get('comment', '')
        output("Voce cancellata data %s, commento %s"
               % (res.get('timestamp', ''), comment))
        delurl = _DELETED_URL_RE.findall(comment)
        if not check_list_bool(delurl, protected_list):
            addlist.extend(delurl)
        tlist = check_list(copyurl, delurl,
                           "\03{lightred}Cancellata stesso sito")
        if tlist:
            add_list_sure_results(pagina.title(),
                                  "Cancellata dallo stesso sito")
            print_url_results(tlist)

    for t in pagina.templatesWithParams():
        low = t[0].lower()
        if low == 'w':
            output("\03{lightred}Da wikificare")
        elif low == 'e':
            output("\03{lightred}Probabile promozione")
        elif low == 't':
            output("\03{lightgreen}In traduzione")
        elif low == 'cancellazione':
            output("\03{lightred}In cancellazione")
            add_list_sure_results(pagina.title(), "In cancellazione")
        elif low == 'crediti':
            add_list_sure_results(pagina.title(), "OTRS")
            output("\03{lightgreen}crediti trovati: sito %s, %s\n"
                   % (_template_param(t[1], 0), _template_param(t[1], 1)))

    try:
        talk = pagina.toggleTalkPage()
        for ta in talk.templatesWithParams():
            low = ta[0].lower()
            first = _template_param(ta[1], 0)
            second = _template_param(ta[1], 1)
            if low == 'scorporounione':
                output("\03{lightgreen}%s, da pagina %s"
                       % (getconten(first), getconten(second)))
            elif low == 'noncancellata':
                output("Salvata in cancellazione")
                add_list_sure_results(pagina.title(),
                                      "Salvata in cancellazione")
            elif low == 'crediti':
                add_list_sure_results(pagina.title(), "OTRS")
                output("\03{lightgreen}crediti trovati: sito %s, %s\n"
                       % (first, second))
            elif low == 'tradotto da':
                output("\03{lightgreen}tradotto da: wiki %s, voce %s\n"
                       % (first, second))
    except Exception:
        # Deliberately best effort: a missing or unreadable talk page must
        # not stop the analysis of the article itself.
        pass


def analisi_traduzione(text):
    """Return [language code, title] pairs for translation sources.

    For every {{T|...lingua=X|...}} template whose language is known, the
    first interwiki link of that language found in `text` is returned.
    Templates with an unknown language are skipped.
    """
    lingue = {"inglese": "en", "francese": "fr", "tedesca": "de",
              "spagnola": "es", "russa": "ru", "portoghese": "pt"}
    tradlist = []
    for numcode in _TRANSLATION_TEMPLATE_RE.findall(text):
        wikicode = lingue.get(numcode.strip())
        if wikicode is None:
            continue
        interwiki = re.search(
            r"\[\[" + re.escape(wikicode) + r":(.*?)\]\]", text, re.S)
        if interwiki:
            tradlist.append([wikicode, interwiki.group(1)])
    return tradlist


def controlla_pagina_da_cancellare(title, pageevents):
    """Record why page `title` was deleted in the past.

    Every deletion-log comment is inspected for the markers of simplified
    deletion, promotional content (C4) and copyright violation (C13).
    """
    for res in pageevents:
        comment = res.get('comment', '')
        if 'semplificata' in comment:
            add_list_sure_results(title, "Cancellata in semplificata")
        if 'C4' in comment:
            add_list_sure_results(title, "Cancellata promozionale")
        if ('C13' in comment or 'copyviol' in comment
                or 'copyright' in comment):
            add_list_sure_results(
                title, "Cancellata per violazione di copyright ")


def exampage(ris):
    """Examine one report section.

    `ris` is a (title, heading remainder, body) tuple as produced by
    get_result_list(). The urls of the body are classified first. Unless
    only urls are requested, the deletion log and the templates of the
    page are analysed, and finally (not in `notext` mode) the reported
    texts are searched in the page history.
    """
    body = ris[2]
    title = ris[0]
    listurl = list(set(_URL_RE.findall(body)))
    blackl = list(set(_BLACKLIST_RE.findall(body)))
    urlist = examine_site_url(listurl, blackl, title)
    copyurl = listurl + blackl
    if not urlist and onlyurl:
        return

    output("\n== [[%s]] ==\n" % title)
    if verbose:
        output("%s" % ris[1])
        output("%s" % body)
    if urlist:
        print_url_results(urlist)
    if onlyurl:
        return

    pageevents = get_log_delete(title)
    controlla_pagina_da_cancellare(title, pageevents)
    pagina = wikipedia.Page(wikipedia.getSite(), title)
    analisi_template(pagina, copyurl, pageevents)
    if notext:
        return

    pagtxt = pagina.get()
    output("=== Analisi pagina ===")
    tradlist = analisi_traduzione(pagtxt)
    if autost:
        choice = 'S'
    else:
        choice = wikipedia.inputChoice(
            'Posso procedere ( a per altra wiki o altra voce )?',
            ['Si', 'No', 'Altra'], ['s', 'N', 'a'], 'N')
    # Texts given with -text replace the ones listed in the report.
    listtextcopyviol = ltext or _COPYVIOL_TEXT_RE.findall(body)

    if choice in ['S', 's']:
        output("=== %s - %s ===" % (wikipedia.getSite(), title))
        search_text(pagina, listtextcopyviol, 'it')
        for element in tradlist:
            sito = wikipedia.getSite(element[0])
            output("=== %s - %s ===" % (sito, element[1]))
            pagina = wikipedia.Page(sito, element[1])
            # The links must point to the wiki the text was translated from.
            search_text(pagina, listtextcopyviol, element[0])
    elif choice in ['A', 'a']:
        wikisc = wikipedia.input('Quale wiki [[en]] ?')
        if not wikisc:
            wikisc = 'en'
        pagesc = wikipedia.input('Quale pagina? [[' + title + ']] ?')
        if not pagesc:
            pagesc = title
        pagina = wikipedia.Page(wikipedia.getSite(wikisc), pagesc)
        search_text(pagina, listtextcopyviol, wikisc)


def get_result_list(text):
    """Split a report page into (title, heading remainder, body) tuples.

    With the global `subp` only that section is returned. With `frompage`
    and/or `topage` the sections from `frompage` (inclusive; the first
    section if unset) up to `topage` (exclusive) are returned; this range
    takes precedence over `subp`.
    """
    # Sentinel so that the last section is terminated like the others.
    text += '\n==='
    tmpresult = _SECTION_RE.findall(text)
    result = tmpresult
    if subp:
        result = [ris for ris in tmpresult if ris[0] == subp]
    if frompage or topage:
        getris = not frompage
        result = []
        for ris in tmpresult:
            if frompage and ris[0] == frompage:
                getris = True
            if topage and ris[0] == topage:
                getris = False
            if getris:
                result.append(ris)
    return result


def get_count_revertbot(artlist):
    """Print the number of reports on each page of `artlist` and exit."""
    count = 0
    for pagina in artlist:
        try:
            reverttext = pagina.get()
        except wikipedia.NoPage:
            output("%s doesn't exist!" % pagina.title())
            continue
        pagecount = len(re.findall(r'=== \[\[', reverttext))
        count += pagecount
        output("Page %s, signalations %d" % (pagina, pagecount))
    output("Total signalations %d" % count)
    sys.exit()


def get_revertbot_list():
    """Return the pages of the copyright reports category."""
    site = wikipedia.getSite()
    cat = catlib.Category(
        site, 'Category:%s' % wikipedia.translate(site, reports_cat))
    return cat.articlesList()


def get_pagina_revertbot(titolo):
    """Return the report page to examine.

    With an empty `titolo` the report pages are listed and the user picks
    one by number. A number greater than the number of pages prints the
    report counts and exits. Anything that is not a positive number makes
    the question be asked again.
    """
    if titolo != '':
        return wikipedia.Page(wikipedia.getSite(), titolo)

    artlist = get_revertbot_list()
    num = 0
    for i in artlist:
        num += 1
        wikipedia.output("%d: %s" % (num, i))
    wikipedia.output("%d: %s" % (num + 1, "Conta segnalazioni"))
    while True:
        scelta = wikipedia.input('Quale pagina vuoi esaminare?')
        try:
            numero = int(scelta)
        except (TypeError, ValueError):
            continue
        if numero > len(artlist):
            get_count_revertbot(artlist)  # does not return
        if numero >= 1:
            return artlist[numero - 1]


def exam_list(result):
    """Examine every report of `result`.

    Unless running in auto mode, the user is asked after each report
    whether to continue, repeat the same report or stop. An error while
    examining one report is shown and does not stop the run.
    """
    num = 0
    while num < len(result):
        ris = result[num]
        try:
            exampage(ris)
        except Exception:
            # Report the failure instead of hiding it. Ctrl-C and
            # sys.exit() are not Exception subclasses and pass through.
            output("\n")
            output("%r" % (sys.exc_info()[1],))
        if not autost:
            choice = wikipedia.inputChoice(
                u'Continuo, Ripeti, o esci',
                ['C', 'R', 'N'], ['C', 'r', 'n'], 'C')
            if choice not in ['C', 'c', 'R', 'r']:
                return
            if choice in ['R', 'r']:
                num -= 1
        num += 1


def esamina_pagina(pagina):
    """Read report page `pagina` and examine all selected reports on it."""
    try:
        reverttext = pagina.get()
        wikipedia.output(u'\n\t>>>Inizio a controllare!<<<\n')
        exam_list(get_result_list(reverttext))
    except wikipedia.NoPage:
        output("%s doesn't exist!" % pagina.title())


def _load_cached_page(title, filename, force_update):
    """Refresh the cache file of it.wikipedia page `title`, return its text."""
    page = wikipedia.Page(wikipedia.getSite("it"), title)
    load_pages(page, filename, force_update)
    return st_read_file(filename)


def load_data(force_update=False):
    """Load the protected/suspected site lists, the user list and the
    url exclusion list into the module globals.

    With `force_update` the cached files are downloaded again even when
    they are recent.
    """
    global protected_list
    global suspected_list
    global user_list

    output("Load Data")
    output("Siti Protetti")
    protected_list = get_urls(_load_cached_page(
        "Utente:Lusum/SitiProtetti", 'siti_protetti.txt', force_update))

    output("Siti Sospetti")
    suspected_list = get_urls(_load_cached_page(
        "Utente:Lusum/SitiSospetti", 'siti_sospetti.txt', force_update))

    output("Utenti Sospetti")
    user_list = get_cococo_list(_load_cached_page(
        "Progetto:Cococo/controlli/lista", 'utenti_sospetti.txt',
        force_update))
    user_list = [f for f in user_list if f is not None]
    if cococo:
        usersusp = wikipedia.Page(wikipedia.getSite("it"),
                                  "Discussioni_progetto:Cococo")
        user_list += get_user_list(usersusp)

    URLexcl.download(force_update)
    URLexcl.scan()


def _arg_value(arg, option, prompt):
    """Return the value of "option:value", asking the user if none is given."""
    if len(arg) == len(option):
        return wikipedia.input(prompt)
    return arg[len(option) + 1:]


def main():
    """Parse the command line, load the lists and examine the report pages."""
    global simil
    global subp
    global frompage
    global topage
    global notext
    global verbose
    global autost
    global onlyurl
    global ltext
    global cococo
    global addlist
    global listRisulSicuri
    global URLexcl

    titolo = ''
    subp = ''
    frompage = ''
    topage = ''
    ltext = []
    allist = []
    addlist = []
    listRisulSicuri = []
    autost = False
    notext = True
    verbose = True
    onlyurl = False
    allpages = False
    force_update = False
    cococo = False
    URLexcl = URLExclusion()

    for arg in wikipedia.handleArgs():
        if arg == '-auto':
            autost = True
        elif arg == '-vers':
            notext = False
        elif arg == '-onlyurl':
            onlyurl = True
            verbose = False
            autost = True
        elif arg == '-noverbose':
            verbose = False
        elif arg == '-analysis':
            verbose = False
            autost = True
        elif arg.startswith('-page'):
            titolo = arg[len('-page:'):]
        elif arg.startswith('-subpage'):
            subp = _arg_value(arg, '-subpage',
                              u'Choose a section to examine: ')
        elif arg.startswith('-frompage'):
            frompage = _arg_value(arg, '-frompage',
                                  u'Choose a page to start examination: ')
        elif arg.startswith('-topage'):
            topage = _arg_value(arg, '-topage',
                                u'Choose a page to stop examination: ')
        elif arg == '-simil':
            simil = True
        elif arg == '-all':
            allpages = True
        elif arg.startswith('-text'):
            ltext.append(_arg_value(arg, '-text',
                                    u'Choose a text to examine: '))
        elif arg.startswith('-forceupdate'):
            force_update = True
        elif arg.startswith('-cococo'):
            cococo = True
        else:
            output('incorrect arg %s' % arg)

    load_data(force_update)
    if allpages:
        allist = get_revertbot_list()
    else:
        allist.append(get_pagina_revertbot(titolo))

    for pagina in allist:
        output("\n\t%s\n" % pagina)
        esamina_pagina(pagina)

    # Print the pages that can surely be excluded or deleted.
    if listRisulSicuri:
        output('\nRisultati')
        for entry in listRisulSicuri:
            if entry:
                output('%s, %s' % (entry[0], entry[1]))
    output("\nda aggiungere alla protected list\n")
    for i in addlist:
        output("%s" % i)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()