Purple exclamation mark.svg Planning the future of Botwiki! - Help us bring Botwiki up to date, contribute to our strategy discussion, add bot scripts, and contribute manuals, guides, and tutorials! Almost anything related to bots, particularly those used to edit mediawiki, is welcome.

Red exclamation mark.svg UNABLE TO EDIT? - We've experienced attacks by spambots lately and now require you to confirm your e-mail before you can edit (go to your preferences, enter an e-mail address, and request a confirmation e-mail, then go to your e-mail and click on the confirmation link). We also require new accounts to make a few edits and wait a few minutes before you can create a page; if this is a problem, contact us in #botwiki and we can manually confirm your account. Sorry for the inconvenience.

Python:Stampast.py

From Botwiki
Jump to: navigation, search

This script is used to process copyright violations on the Italian Wikipedia: it takes the copied text and finds the first revision in the page history that contains it.

download

#!/usr/bin/python
#! -*- coding: utf-8 -*-
 
# Author: Lusum
# Minor fixes: Filnik
# License: GPL2
# Description: Program to parse the copyviol advises.
"""
The results are logged also in a file called stresult.txt
You can run the bot with the following commandline parameters:
 
-page        - Only check a specific page of revertBot.
               Argument can also be given as "-page:pagetitle". 
-subpage     - Examine only a specific section of revertBot
-vers         - Perform versions test analysis
-noverbose   - Less messages printed
-frompage    - Examine from a specific section of revertBot
-topage      - Examine until a specific section of revertBot
-auto        - Auto perform text analysis
-all         - Exam all revertBot pages
-text        - Look for a particular text
-cococo      - Look for discussion page of cococo project
-analysis    - Analyze only urls and otrs
-onlyurl     - Analyze only urls
-forceupdate - Force update of copyviol black and whitelist
 
 
Examples:
	python stampast.py -subpage -auto -all   #exam a particular subpage on all revertBot pages
 
	python stampast.py -auto -noverbose # exam the urls and the credit without examining the page
 
	python stampast.py -subpage  #exam a particular subpage
"""
 
import wikipedia, catlib, difflib, urllib2
import re, sys, os, codecs, time
import query
from copyright import remove_wikicode, reports_cat, URLExclusion, read_file
from copyright_put import get_stats
#from wikipedia import fullVersionHistory
 
import simplejson, config
 
# Not read anywhere in the visible code.
# NOTE(review): presumably a switch for the copyright module's
# remove_wikicode -- confirm before removing.
remove_wikicode_dotall = True

# When true, search_text() also reports the revision whose text is most
# similar to the searched text.
simil = True

# NOTE(review): presumably turns off pywikipedia's log file -- confirm.
wikipedia.setLogfileStatus(False)

# Directory name handed to wikipedia.config.datafilepath() for the locally
# cached list files.
appdir = "copyright"
 
def st_read_file( filename ):
    """Return the content of the cached data file *filename*.

    The path is resolved inside the ``appdir`` data directory and the
    reading itself is delegated to ``read_file``.
    """
    datapath = wikipedia.config.datafilepath(appdir, filename)
    wikipedia.config.makepath(datapath)
    return read_file( datapath )
 
def split_params( param ):
    """Split a ``name=value`` string into ``(name, value)``.

    A trailing newline is dropped and leading blanks are removed from the
    name.  Only the text between the first and the second ``=`` is returned
    as value; a string without ``=`` raises IndexError.
    """
    pieces = param.rstrip('\n').split('=')
    return pieces[0].lstrip(' '), pieces[1]
 
def getconten( param ):
    """Return the value part of a ``name=value`` template parameter.

    A trailing newline is stripped first.  When the parameter has no ``=``
    (a positional parameter) the whole stripped string is returned.  The
    split happens on the first ``=`` only, so values that contain ``=``
    themselves (for example URLs with a query string) are kept intact.
    """
    param = param.rstrip('\n')
    pieces = param.split('=', 1)
    if len(pieces) < 2:
        # Positional parameter: nothing to split off.
        return param
    return pieces[1]
 
def add_list_sure_results( title, motivo ):
    """Record ``[title, motivo]`` in the global ``listRisulSicuri`` list."""
    listRisulSicuri.append( [title, motivo] )
 
 
def load_pages(page, filename, force_load=False):
    """Refresh the local cache file of a wiki page when needed.

    The cache file lives in the ``appdir`` data directory.  It is rewritten
    with the current text of *page* when it does not exist yet, when it is
    older than 24 hours, or when *force_load* is true.

    page       -- page object providing title(), get() and
                  getRedirectTarget()
    filename   -- name of the cache file inside the data directory
    force_load -- rewrite the file regardless of its age

    A failed download is reported and leaves the existing file untouched.
    """
    path = wikipedia.config.datafilepath(appdir, filename)
    wikipedia.config.makepath(path)

    if not os.path.exists(path):
        print('Creating file \'%s\' (%s)' % (
                wikipedia.config.shortpath(path), page.title(asLink=True)))
        force_load = True
    elif time.time() - os.path.getmtime(path) > 24 * 60 * 60:
        print('Updating file \'%s\' (%s)' % (
                wikipedia.config.shortpath(path), page.title(asLink=True)))
        force_load = True

    if not force_load:
        return

    data = None
    try:
        data = page.get()
    except KeyboardInterrupt:
        raise
    except wikipedia.IsRedirectPage:
        data = page.getRedirectTarget().get()
    except Exception:
        # The original called an undefined error() helper here, which turned
        # every download problem into a NameError.
        wikipedia.output('Getting page failed')

    if data:
        f = codecs.open(path, 'w', 'utf-8')
        try:
            f.write(data)
        finally:
            f.close()
    return
 
def text_similarity( text1 , text2 ):
    """Return the difflib similarity ratio of the two texts."""
    matcher = difflib.SequenceMatcher(None, text1, text2)
    return matcher.ratio()
 
def output( text ):
    """Print *text* by delegating to ``wikipedia.output``."""
    wikipedia.output( text )
 
def examine_user( author ):
    """Warn when *author* contains one of the names in ``user_list``.

    Empty entries of the global ``user_list`` are ignored; one warning is
    printed for every entry that occurs inside *author*.
    """
    for suspect in user_list:
        if not suspect or suspect not in author:
            continue
        output("\03{lightred}user suspected [[Utente:%s]] found \n"
                % ( author ))
 
 
def search_text( page, listtext, lang, examuser = True):
    """Look for each text of *listtext* in the revision history of *page*.

    page      -- page object providing fullVersionHistory() and title();
                 every history entry is indexed as [0] revision id,
                 [1] date, [2] author, [3] wikitext
    listtext  -- texts to search for
    lang      -- language code used to build the printed revision link
    examuser  -- when true, the author of the matching revision is checked
                 with examine_user()

    For every text the first history entry (in the order returned by
    fullVersionHistory) containing it is reported.  When the global
    ``simil`` is true the most similar revision scanned for that text is
    reported as well.
    """
    fullVersHist = page.fullVersionHistory()

    for text in listtext:
        # All of this state is per text: it must not leak from one searched
        # text to the next.
        found = False
        similmax = 0
        similversion = None

        output("\n==== Testo ====\n")
        output("%s\n" % text)
        for vers in fullVersHist:
            old = remove_wikicode(vers[3])
            if simil:
                similvalue = text_similarity( old, text )
                if similvalue > similmax:
                    similmax = similvalue
                    similversion = vers
            if text in old:
                output("Prima versione trovata\n")
                output("data) %s \n" % vers[1] )
                output("autore) [[Utente:%s]]\n" % vers[2])
                output("Link http://%s.wikipedia.org/w/index.php?title=%s&oldid=%s \
                        \n" % (lang,page.title().replace(" ","_").replace("\"", "%22"),vers[0]))
                if examuser:
                    # vers[2] is the author (vers[1] is the date).
                    examine_user( vers[2] )
                found = True
                break
        if not found:
            output("testo non trovato\n")

        if simil:
            output("==== Similarity ====\n")
            output("ratio %s \n" % similmax)
            # No revision scored above zero (for instance an empty history):
            # there is nothing to describe.
            if similversion is not None:
                output("data) %s \n" % similversion[1])
                output("autore) [[Utente:%s]]\n" % similversion[2])
                output("Link http://%s.wikipedia.org/w/index.php?title=%s&oldid=%s \
                    \n" % (lang,page.title().replace(" ", "_").replace("\"","%22"),similversion[0]))
 
    #print found 
 
def clean_engines(text):
    """Drop everything up to and including the last ``-`` of each line.

    An empty or missing *text* yields the empty string.
    """
    return re.sub('(.*)-', '', text) if text else ""
 
def check_list( listurl, clist, reason, printentry = False ):
    """Return ``[reason, url]`` pairs for the urls containing an entry.

    Every non-empty entry of *clist* is tested as a substring of every url
    of *listurl*; results are grouped by entry, in *clist* order.  With
    *printentry* the matching entry is appended to the reported url.
    """
    return [
        [reason, (url + " , entry " + entry) if printentry else url]
        for entry in clist if entry
        for url in listurl if entry in url
    ]
 
 
def check_exclusion_list( listurl, reason ):
    """Return ``[reason, url]`` for every url accepted by ``URLexcl.check``."""
    return [[reason, url] for url in listurl if URLexcl.check( url )]
 
 
 
def check_list_bool( listurl, clist ):
    """Tell whether any non-empty entry of *clist* occurs inside any url."""
    return any(
        entry in url
        for entry in clist if entry
        for url in listurl
    )
 
def get_urls(pagetxt):
    """Turn the text of a list page into a list of lines.

    Comments starting with ``#``, empty lines and ``<pre>`` / ``</pre>``
    tags are removed, in this order, before the text is split into lines.
    """
    cleaned = pagetxt
    for pattern in (" ?#.*", "(?m)^\r?\n", "</?pre>"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.splitlines()
 
def get_user_list( pagetitle ):
    """Return the names found in the ``{{User|...}}`` templates of a page.

    Despite its name, *pagetitle* is a page object: its text is read with
    ``get()``.
    """
    wikitext = pagetitle.get()
    return re.findall("\{\{[U|u]ser\|(.*?)\}\}", wikitext, re.S)
 
def get_cococo_list( pagetxt ):
    """Return the ``utente`` value of every ``{{cococo|utente=...|...}}``."""
    return re.findall("\{\{cococo\|utente\=(.*?)\|.*?\}\}", pagetxt, re.S)
 
def print_url_results( mlist ):
    """Print every non-empty ``[reason, url]`` pair as ``reason, url``."""
    for row in mlist:
        if not row:
            continue
        output('%s, %s' % ( row[0], row[1] ) )
 
def examine_url_regex( listurl ):
    """Flag the urls that match one of the known law-text url patterns.

    Returns one ``["\\03{lightgreen}Probabile legge", url]`` pair per
    matching url, in input order.
    """
    law_patterns = [re.compile(expr, re.S) for expr in (
        r'.*dlgs\d+_\d+\.html',
        r'.*leggecomunitaria\d+.htm(l)?',
        r'www\.camera\.it/parlam/leggi',
        r'www\.giustizia\.it/cassazione/leggi',
        r'www\.parlamento\.it/parlam/leggi',
    )]

    flagged = []
    for url in listurl:
        for pattern in law_patterns:
            if pattern.search( url ):
                flagged.append( ["\03{lightgreen}Probabile legge", url] )
                break
    return flagged
 
 
def examine_site_url( listurl, blackl, pagetitle ):
    """Classify the urls reported for a page.

    listurl   -- urls found in the report; blacklisted urls are removed
                 from this list in place
    blackl    -- urls the report marked as ``<blacklist>``
    pagetitle -- title of the examined page

    Returns a list of ``[reason, url]`` pairs: the blacklisted urls first,
    then probable law texts, urls containing a word of the title, protected
    sites, suspected sites and urls matched by the exclusion list.
    """
    mlist = []

    for urtmp in blackl:
        # Blacklisted urls are reported on their own and must not be
        # analysed again below.
        if urtmp in listurl:
            listurl.remove( urtmp )
        mlist.append( ["\03{lightblue}<blacklist>", urtmp] )

    # Lower-cased words of the title; words of one to three letters are
    # dropped to avoid false positives.  A new list is built because
    # removing items from a list while iterating over it skips elements.
    pagetitle_list = [word for word in pagetitle.lower().split(' ')
                      if len(word) >= 4]

    mlist += examine_url_regex( listurl )

    # urls that contain a word of the page title
    mlist += check_list( listurl, pagetitle_list, "Nome voce presente nell'url", True )

    # urls that belong to the protected sites list
    mlist += check_list( listurl, protected_list, "\03{lightred}Sito protetto" )

    # urls that belong to the suspected sites list, with the matching entry
    mlist += check_list( listurl, suspected_list, "\03{lightaqua}Sito sospetto", True )

    mlist += check_exclusion_list( listurl, "\03{lightgreen}exclusion list" )

    return mlist
 
def get_log_delete( title ):
    """Return the deletion log events of the page *title*.

    The log is requested through ``query.GetData`` and the ``logevents``
    list of the answer is returned.
    """
    request = {'action': 'query', 'list': 'logevents'}
    request['letype'] = 'delete'
    request['letitle'] = title
    request['leprop'] = 'timestamp|comment|type'

    answer = query.GetData(request, useAPI = True, encodeTitle = False)
    return answer['query']['logevents']
 
def analisi_template( pagina, copyurl, pageevents ):
    """Report deletion-log and template hints about a page.

    pagina     -- page object providing title(), templatesWithParams() and
                  toggleTalkPage()
    copyurl    -- urls the page is suspected to be copied from
    pageevents -- deletion log events, dicts with 'comment' and 'timestamp'

    Urls found in deletion comments are added to the global ``addlist``
    unless they match ``protected_list``; certain results are recorded with
    add_list_sure_results().  Templates are read as (name, params) pairs.
    """
    # Compiled once: the pattern does not depend on the event.
    regcomm = re.compile(".*?(http://.*)[ |\n]?.*?", re.S)

    for res in pageevents:
        comment = res['comment']
        output( "Voce cancellata data %s, commento %s" %
                (res['timestamp'],comment) )

        delurl = regcomm.findall( comment )
        if not check_list_bool( delurl, protected_list ):
            addlist.extend(delurl)
        tlist = check_list( copyurl, delurl, "\03{lightred}Cancellata stesso sito" )
        if tlist:
            add_list_sure_results( pagina.title(), "Cancellata dallo stesso sito"  )
            print_url_results( tlist )

    for t in pagina.templatesWithParams():
        low = t[0].lower()
        if low == 'w':
            output( "\03{lightred}Da wikificare" )
        elif low == 'e':
            output( "\03{lightred}Probabile promozione" )
        elif low == 't':
            output( "\03{lightgreen}In traduzione" )
        elif low == 'cancellazione':
            output( "\03{lightred}In cancellazione" )
            add_list_sure_results( pagina.title(), "In cancellazione"  )
        elif low == 'crediti':
            add_list_sure_results( pagina.title(), "OTRS"  )
            output( "\03{lightgreen}crediti trovati: sito %s, %s\n"
                    % ( t[1][0], t[1][1] ) )

    try:
        talk = pagina.toggleTalkPage()

        for ta in talk.templatesWithParams():
            low = ta[0].lower()
            if low == 'scorporounione':
                output( "\03{lightgreen}%s, da pagina %s" %
                        ( getconten( ta[1][0] ) , getconten( ta[1][1] )  ) )
            elif low == 'noncancellata':
                output( "Salvata in cancellazione" )
                add_list_sure_results( pagina.title(), "Salvata in cancellazione" )
            elif low == 'crediti':
                add_list_sure_results( pagina.title(), "OTRS"  )
                output( "\03{lightgreen}crediti trovati: sito %s, %s\n"
                    % ( ta[1][0], ta[1][1] ) )
            elif low == 'tradotto da':
                output( "\03{lightgreen}tradotto da: wiki %s, voce %s\n"
                    % ( ta[1][0], ta[1][1] ) )
    except Exception:
        # The talk page is examined on a best-effort basis: a missing or
        # unreadable talk page is not an error.  Catching Exception instead
        # of everything lets Ctrl-C interrupt the run.
        pass
 
def analisi_traduzione( text ):
    """Find the interwiki targets of the translation templates in *text*.

    Every ``{{T|...lingua=<name>|...}}`` template names a source language.
    For each known language the first ``[[code:Title]]`` interwiki link of
    the text is looked up.

    Returns a list of ``[wikicode, title]`` pairs.  Languages missing from
    the table below, and languages without an interwiki link in the text,
    are skipped.
    """
    lingue = { "inglese":"en", "francese":"fr", "tedesca": "de",
            "spagnola":"es", "russa":"ru", "portoghese":"pt"}
    tradlist = []

    regextrad = re.compile(r"\{\{[t|T]\|.*?lingua\=(.*?)\|.*?\}\}", re.S)

    for langname in regextrad.findall( text ):
        # An unknown language used to raise KeyError and abort the whole
        # page analysis; it is now simply ignored.
        wikicode = lingue.get( langname.strip().lower() )
        if wikicode is None:
            continue

        interwiki = re.search(r"\[\[" + wikicode + r"\:(.*?)\]\]", text, re.S)
        if interwiki:
            tradlist.append( [wikicode, interwiki.group(1)] )

    return tradlist
 
def controlla_pagina_da_cancellare( title, pageevents ):
    """Record why the page *title* was deleted in the past.

    *pageevents* are deletion log events (dicts); their 'comment' text is
    scanned for known markers and every hit is stored through
    add_list_sure_results().  Events without a comment are ignored.
    """
    for res in pageevents:
        comment = res.get('comment') or ''
        if 'semplificata' in comment:
            add_list_sure_results( title, "Cancellata in semplificata"  )
        if 'C4' in comment:
            add_list_sure_results( title, "Cancellata promozionale"  )
        # Each marker needs its own "in" test: the former
        # "'C13' or 'copyviol' or 'copyright' in comment" was always true.
        if any(marker in comment for marker in ('C13', 'copyviol', 'copyright')):
            motivo = "Cancellata per violazione di copyright "
            add_list_sure_results( title, motivo )
 
 
def exampage( ris ):
    """Examine one report entry.

    *ris* is a tuple as produced by get_result_list(): [0] page title,
    [1] text following the title on the heading line, [2] body of the
    report.

    The urls of the report are classified first; unless ``onlyurl`` is set
    the deletion log and the templates of the page are examined too.  The
    revision history is searched only when ``notext`` is false, after a
    confirmation unless ``autost`` is set.
    """
    title = ris[0]

    regex3 = re.compile("(http://.*?)[ |\n]", re.S)
    listurl = list(set(regex3.findall( ris[2] )))

    regexsp = re.compile("\<blacklist\>(.*?)[ |\n]", re.S)
    blackl = list(set(regexsp.findall( ris[2] )))

    urlist = examine_site_url( listurl, blackl, title )

    copyurl = listurl + blackl

    if not urlist and onlyurl:
        return

    output("\n== [[%s]] ==\n" % title )
    if verbose:
        output("%s" % ris[1])
        output("%s" % ris[2])

    if urlist:
        print_url_results( urlist )

    if onlyurl:
        return

    pageevents = get_log_delete( title )

    controlla_pagina_da_cancellare( title, pageevents )

    pagina = wikipedia.Page(wikipedia.getSite(), title )

    analisi_template( pagina, copyurl, pageevents )

    if notext:
        return

    pagtxt = pagina.get()

    output("=== Analisi pagina ===")

    tradlist = analisi_traduzione( pagtxt )

    if autost:
        choice = 'S'
    else:
        choice = wikipedia.inputChoice(
                'Posso procedere ( a per altra wiki o altra voce )?',
                ['Si', 'No', 'Altra'], ['s', 'N', 'a'], 'N')

    regex2 = re.compile("\*\*(.*?)\n", re.S)
    listtextcopyviol = regex2.findall( ris[2] )

    # Texts given on the command line replace the ones of the report.
    if ltext:
        listtextcopyviol = ltext

    if choice in ['S', 's']:
        output("=== %s - %s ===" % (wikipedia.getSite(), title) )
        search_text( pagina, listtextcopyviol, 'it')
        for element in tradlist:
            output("=== %s - %s ===" % (wikipedia.getSite( element[0] ),
                element[1]) )
            pagina = wikipedia.Page(wikipedia.getSite( element[0] ),
                    element[1])
            # The page lives on the wiki element[0]: use that code, not
            # 'it', so that the printed revision links are valid.
            search_text( pagina, listtextcopyviol, element[0] )
    elif choice in ['A', 'a']:
        wikisc = wikipedia.input('Quale wiki [[en]] ?')
        if not wikisc:
            wikisc = 'en'
        pagin = 'Quale pagina? [[' + title + ']] ?'
        pagesc = wikipedia.input( pagin )
        if not pagesc:
            pagesc = title
        pagina = wikipedia.Page( wikipedia.getSite( wikisc ), pagesc)
        search_text( pagina, listtextcopyviol, wikisc )
 
def get_result_list( text ):
    """Split the text of a report page into its entries.

    Every entry is a tuple (title, rest of the heading line, body).  The
    global filters are honoured: ``frompage`` keeps the entries from that
    title up to, but not including, ``topage``; otherwise ``subp`` keeps
    only the entries with that title.  ``frompage`` wins when both are set.
    """
    pattern = re.compile(
            '=== ?\[\[(.*?)\]\] ?(.*?) ?===\n?(.*?(?=\=\=\=))', re.S)
    # The closing marker lets the look-ahead terminate the last entry.
    entries = pattern.findall(text + '\n===')

    if frompage:
        selected = []
        collecting = False
        for entry in entries:
            name = entry[0]
            if name == frompage:
                collecting = True
            if topage and name == topage:
                collecting = False
            if collecting:
                selected.append( entry )
        return selected

    if subp:
        return [entry for entry in entries if entry[0] == subp]

    return entries
 
 
def get_count_revertbot( artlist ):
    """Print the number of reports on every page of *artlist* and exit.

    The reports are counted through the ``=== [[`` heading marker.  Pages
    that do not exist are reported and skipped.  The function always ends
    the program with sys.exit().
    """
    count = 0
    for pagina in artlist:
        try:
            reverttext = pagina.get()
        except wikipedia.NoPage:
            output("%s doesn't exist!" % pagina.title())
            # Nothing to count: the former code went on and printed an
            # unbound or stale page count here.
            continue
        pagecount = len(re.findall(r'=== \[\[', reverttext))
        count += pagecount
        output("Page %s, signalations %d" % ( pagina, pagecount ))

    output("Total signalations %d" % count )

    sys.exit()
 
def get_revertbot_list():
    """Return the articles of the reports category of the current site.

    The category name is the translation of ``reports_cat`` for the site.
    """
    site = wikipedia.getSite()
    category_name = 'Category:%s' % wikipedia.translate(wikipedia.getSite(),
                                                        reports_cat)
    return catlib.Category(site, category_name).articlesList()
 
def get_pagina_revertbot( titolo ):
    """Return the report page to examine.

    With a non-empty *titolo* the page of that title is returned.  Otherwise
    the pages of the reports category are listed and the user picks one by
    number; a number greater than the amount of pages runs
    get_count_revertbot().  Anything that is not a valid number is rejected
    and asked again.
    """
    if titolo != '':
        return wikipedia.Page(wikipedia.getSite(), titolo)

    artlist = get_revertbot_list()
    for num, art in enumerate(artlist):
        wikipedia.output("%d: %s" % (num + 1, art))
    wikipedia.output("%d: %s" % (len(artlist) + 1, "Conta segnalazioni"))

    while True:
        scelta = wikipedia.input('Quale pagina vuoi esaminare?')
        try:
            numero = int(scelta)
        except ValueError:
            numero = 0

        if numero > len(artlist):
            get_count_revertbot( artlist )
            # Not reached in practice: get_count_revertbot() ends with
            # sys.exit().
            continue
        if numero >= 1:
            return artlist[numero - 1]
        # 0, negative numbers and non-numeric answers used to crash or to
        # silently pick a page counted from the end of the list.
        wikipedia.output("Scelta non valida: %s" % scelta)
 
 
def exam_list( result ):
    """Run exampage() on every entry of *result*.

    A failure on one entry is reported and does not stop the run.  Unless
    the global ``autost`` is set, the user is asked after every entry
    whether to continue, repeat the same entry, or stop.
    """
    num = 0
    while num < len( result ):
        ris = result[num]
        try:
            exampage( ris )
        except Exception:
            # Only ordinary errors are absorbed, so Ctrl-C still stops the
            # bot; the error is shown instead of being silently dropped.
            output("\n")
            output("Skipped %s: %r" % ( ris[0], sys.exc_info()[1] ))

        if not autost:
            choice = wikipedia.inputChoice(
                u'Continuo, Ripeti, o esci',  ['C' ,'R', 'N'],
                ['C', 'r' ,'n' ], 'C')
            if choice not in ['C', 'c', 'R', 'r']:
                return
            if choice in ['R', 'r']:
                # Cancels the increment below, so the entry is examined again.
                num -= 1
        num += 1
 
def esamina_pagina( pagina ):
    """Examine all the report entries of the page *pagina*.

    A ``wikipedia.NoPage`` error raised anywhere in the process is reported
    as a missing page.
    """
    try:
        testo = pagina.get()
        wikipedia.output(u'\n\t>>>Inizio a controllare!<<<\n')
        exam_list( get_result_list( testo ) )
    except wikipedia.NoPage:
        output("%s doesn't exist!" % pagina.title())
 
 
def load_data(force_update = False):
    """Fill the global protected, suspected and user lists.

    Each list comes from a page of the Italian wiki that is cached in a
    local file by load_pages(); *force_update* forces the refresh of the
    cache files and of the url exclusion list.  When the global ``cococo``
    is set, the users named on the Cococo project talk page are added to
    ``user_list``.
    """
    global protected_list
    global suspected_list
    global user_list

    protected_list = []
    suspected_list = []
    user_list = []

    def cached_text( label, wikititle, filename ):
        # Announce the list, refresh its cache file if needed, read it back.
        output( label )
        source = wikipedia.Page(wikipedia.getSite("it"), wikititle)
        load_pages( source, filename , force_update )
        return st_read_file( filename )

    output("Load Data")

    protected_list = get_urls( cached_text(
            "Siti Protetti", "Utente:Lusum/SitiProtetti",
            'siti_protetti.txt') )

    suspected_list = get_urls( cached_text(
            "Siti Sospetti", "Utente:Lusum/SitiSospetti",
            'siti_sospetti.txt') )

    cococo_names = get_cococo_list( cached_text(
            "Utenti Sospetti", "Progetto:Cococo/controlli/lista",
            'utenti_sospetti.txt') )
    user_list = [name for name in cococo_names if name != None]

    if cococo:
        usersusp = wikipedia.Page(wikipedia.getSite("it"),
            "Discussioni_progetto:Cococo")
        user_list += get_user_list( usersusp )

    URLexcl.download( force_update )
    URLexcl.scan()
 
def _option_value( arg, option, prompt ):
    """Return the value of ``option:value``; ask with *prompt* for a bare option."""
    if len(arg) == len(option):
        return wikipedia.input( prompt )
    # Skip the option name and the separator that follows it.
    return arg[len(option) + 1:]


def main():
    """Parse the command line, load the lists and examine the report pages.

    The options are stored in module globals read by the other functions.
    At the end the certain results and the urls to add to the protected
    list are printed.
    """
    global simil
    global subp
    global frompage
    global topage
    global notext
    global verbose
    global autost
    global onlyurl
    global ltext
    global cococo
    global addlist
    global listRisulSicuri
    global URLexcl

    titolo = ''
    subp = ''
    frompage = ''
    topage = ''
    ltext = []
    allist = []
    addlist = []
    listRisulSicuri = []
    autost = False
    notext = True
    verbose = True
    onlyurl = False
    allpages = False
    force_update = False
    cococo = False
    URLexcl = URLExclusion()

    args = wikipedia.handleArgs()

    for arg in args:
        if arg == '-auto':
            autost = True
        elif arg == '-vers':
            notext = False
        elif arg == '-onlyurl':
            onlyurl = True
            verbose = False
            autost = True
        elif arg == '-noverbose':
            verbose = False
        elif arg == '-analysis':
            verbose = False
            autost = True
        elif arg.startswith('-page'):
            titolo = arg[len('-page:'):]
        elif arg.startswith('-subpage'):
            subp = _option_value( arg, '-subpage',
                    u'Choose a section to examine: ' )
        elif arg.startswith('-frompage'):
            # The value used to be cut with len('-topage:'), which left
            # part of the option name in front of the title.
            frompage = _option_value( arg, '-frompage',
                    u'Choose a page to start examination: ' )
        elif arg.startswith('-topage'):
            topage = _option_value( arg, '-topage',
                    u'Choose a page to stop examination: ' )
        elif arg == '-simil':
            simil = True
        elif arg == '-all':
            allpages = True
        elif arg.startswith('-text'):
            ltext.append( _option_value( arg, '-text',
                    u'Choose a text to examine: ' ) )
        elif arg.startswith('-forceupdate'):
            force_update = True
        elif arg.startswith('-cococo'):
            cococo = True
        else:
            output('incorrect arg %s' % arg)

    load_data( force_update )

    if allpages:
        allist = get_revertbot_list()
    else:
        allist.append( get_pagina_revertbot( titolo ) )

    for pagina in allist:
        output("\n\t%s\n" % pagina )
        esamina_pagina( pagina )

    # print the pages that certainly have to be excluded or deleted
    if listRisulSicuri:
        output('\nRisultati')
    for entry in listRisulSicuri:
        if entry:
            output('%s, %s' % ( entry[0],entry[1]) )

    output("\nda aggiungere alla protected list\n")
    for i in addlist:
        output("%s" % i )
 
 
# Run the bot only when the file is executed as a script.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Runs even when main() raises.
        # NOTE(review): presumably lets pywikipedia release its resources -- confirm.
        wikipedia.stopme()
Personal tools
Share