Python:Import-it.py

From Botwiki

Jump to: navigation, search
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This is a script written by Filnik using the import class.
 
 
Guide for importers (originally written in Italian; the code itself is
commented in English)
 
This script is deliberately not very polished, so you will have to manage a
bit on your own. To change the page the list of imports is loaded from, edit
the assignment pageLoad = u'...' (search for "pageLoad = u'" to find it).
To change the regular expression, search for "regex". Apart from that, just
put the list on the selected page (changing the regex if needed) and start
the bot. The bot keeps a log in Utente:Filbot/Log, which is later used to
update a page listing every import performed, so please do not comment out
the lines that write it!
 
"""
#
# (C) Filnik, 2007
#
# Greetings:
# Lorenzo Paulatto and Misza13
#
# Distributed under the terms of the MIT license.
#
# Version: 2.5
#
 
import wikipedia, re, time, pagegenerators
from pageimport import *
 
# Global variables
# Site handle for language code u'it' of the u'wiktionary' family. It is read
# by generator() and main() whenever they build wikipedia.Page objects.
site = wikipedia.getSite(u'it', u'wiktionary')
 
def generator(raw):
    """Yield a wikipedia.Page (on the module-level ``site``) per list entry.

    ``raw`` is the text of the request page. An entry is recognised when it
    looks like a bullet (``*``), an optional space, one or two digits, some
    text and finally a ``[[...]]`` link; the link target (group 3) is the
    title of the page that is yielded.

    Adapt the pattern below if the input page is formatted differently
    (for a bare list of links something like ``[[...]]`` alone would do,
    but then the link is no longer group 3).
    """
    entry_pattern = re.compile(
        r"\* ?[0-9]([0-9])? (.*?) .*? \[\[(.*?)\]\]", re.UNICODE)
    for entry in entry_pattern.finditer(raw):
        yield wikipedia.Page(site, entry.group(3))
 
# ################################################################ #
def main():
    """Process every page listed on the request page, one by one.

    For each title produced by generator() the bot:

    1. computes the target title (lower-cased, with any parenthesised part
       removed);
    2. if either the original or the target title already exists on the
       module-level ``site``, only makes sure the page is in the log;
    3. otherwise checks that the it.wikipedia page still exists and still
       carries the transfer template, logs it through report(), imports it,
       tags the it.wikipedia page for speedy deletion, moves the imported
       page to the target title, deletes the leftover redirect and strips
       the category links from the imported text.

    A failed move or a failed redirect deletion aborts the whole run; the
    ``__main__`` guard of this script calls wikipedia.stopme() afterwards.
    """
    wikipedia.output(u'\t\t\t  >> Start! <<')

    # This is the page where the Bot will check to upload the file.
    pageLoad = u'Wikizionario:Importare/Lista'
    # What follows is used to load the page and get the pages to import.
    page = wikipedia.Page(site, pageLoad)
    raw = page.get()

    # The wikipedia site the pages are imported from (loop-invariant).
    pediasite = wikipedia.getSite(u'it', u'wikipedia')
    # One of these tags must be in the wikipedia page for it to be imported.
    tags = (u'{{trasferimento', u'{{template:trasferimento')

    preloadingGen = pagegenerators.PreloadingGenerator(generator(raw), pageNumber=60)
    for oldpag in preloadingGen:
        pagetoload = oldpag.title()
        newpag = pagetoload.title().lower()
        # If the name of the page has brackets, drop them together with
        # what is inside them.
        if u'(' in newpag:
            newpag = re.sub(r'(.*?) ?\((.*?)\) ?(.*?)', r'\1\3', newpag)
        oldpag2 = wikipedia.Page(site, newpag)

        # Nothing to import when the page is already there, under either
        # title: just make sure the log knows about it.
        if oldpag.exists():
            wikipedia.output(u"%s is already in wiktionary, checking the log..." % newpag)
            report(oldpag.title(), newpag)
            continue
        if oldpag2.exists():
            wikipedia.output(u"%s is already in wiktionary, checking the log..." % newpag)
            report(oldpag2.title(), newpag)
            continue

        replacepage = wikipedia.Page(pediasite, pagetoload)
        try:
            replacetext = replacepage.get()
        except wikipedia.NoPage:
            # The exception *class* must be named here: the original
            # ``except wikipedia.NoPage():`` never matched a raised NoPage.
            wikipedia.output(u"%s e' stata cancellata..." % pagetoload)
            continue

        lowered = replacetext.lower()
        if not any(tag in lowered for tag in tags):
            wikipedia.output(u"%s has no more the trasfer-template! Skip!" % newpag)
            continue

        # Compiling the log... a False result means the page is already
        # logged, so it is skipped.
        if not report(pagetoload, newpag):
            continue

        importerbot = Importer(site)  # Initializing the Bot
        domandami = False  # passed as ``prompt``; False presumably means "do not ask" - confirm
        skip_page = False
        while True:
            wikipedia.output(u'Importing %s...' % pagetoload)
            try:
                result = importerbot.Import(pagetoload, prompt=domandami)
            except KeyError:
                # Retrying would raise the same error again, so really skip
                # the page instead of looping forever.
                wikipedia.output(u'Error! The page has a strange character! skip!')
                skip_page = True
                break
            if result:
                break
            wikipedia.output(u'Sleeping for 8 seconds and retry!')
            time.sleep(8)
        if skip_page:
            continue

        # Putting the speedy deletion tag in the wikipedia page.
        newtestreplace = u'{{Cancella subito|Pagina trasferita su wiktionary}}\n'
        replacepage.put(newtestreplace + replacetext, u'Bot: Pagina trasferita')

        # Move the imported page to its lower-case title, retrying a few
        # times on unexpected errors.
        pag = wikipedia.Page(site, pagetoload)
        num = 0
        while True:
            try:
                result1 = pag.move(newpag, reason=u'Bot: Sposto pagina da Maiuscola a minuscola')
                break
            except wikipedia.PageNotSaved:
                # Page already exist.. yes, it has the same name but in lowercase!
                result1 = True
                break
            except Exception:
                if num == 3:
                    # Out of retries: report the move as failed.
                    result1 = False
                    break
                num += 1
                time.sleep(5)
        # ``== False`` on purpose: a move() that returns None is not
        # treated as a failure, exactly as before.
        if result1 == False:
            break
        wikipedia.output(u'Page moved successfully!')

        result2 = pag.delete(u'Bot: Cancello redirect inutile', False)
        if result2 == False:
            break

        # This block isn't tested yet and may give errors so, be careful ^_-
        # It may happen that something (the author does not know what) goes
        # wrong and the mediawiki software deletes not the redirect but the
        # page just created. This block undeletes the page and deletes the
        # right one, then checks again.
        while True:
            try:
                netxt = wikipedia.Page(site, newpag).get()
                break
            except wikipedia.NoPage:
                wikipedia.output(u'Page Deleted! WARNING! BUG! Trying to solve the problem...')
                wikipedia.Page(site, newpag).undelete(u'Bot: Ripesco pagina cancellata per errore dal software mediawiki.')
                pag.delete(u'Bot: Cancello redirect inutile', False)

        # Deleting the (wikipedia) category links left in the wiktionary page...
        putxt = re.sub(r'\[\[[Cc]ategor(ia|y):(.*?)\]\](\n)?', r'', netxt)
        wikipedia.Page(site, newpag).put(putxt, u'Bot: Tolgo categoria di wikipedia.')
 
def report(pagelog, newpag, com = u'Bot: Aggiungo pagina al log',
           site = wikipedia.getSite(u'it', u'wiktionary'), rep_page = u'Utente:Filbot/Log'):
    """Append an import entry for ``pagelog`` to the log page, once.

    Parameters:
        pagelog  -- title of the imported (wikipedia) page; also the text
                    looked for in the log to detect an existing entry.
        newpag   -- destination title written into the log entry.
        com      -- edit summary used when saving the log page.
        site     -- site the log page lives on.
        rep_page -- title of the log page.

    Returns True when a new entry was written, False when ``pagelog``
    already occurs somewhere in the log text (nothing is saved then).
    """
    another_page = wikipedia.Page(site, rep_page)
    if another_page.exists():
        text_get = another_page.get()
    else:
        # NOTE(review): ``config`` is not imported explicitly in this file;
        # presumably it arrives through ``from pageimport import *`` - confirm.
        usr = config.usernames[u'wiktionary']
        text_get = u"This is a report page for the imported pages, please translate me. --[[User:%s|%s]]" % (usr[u'it'], usr[u'it'])

    # Literal substring test. The title used to be compiled as a regular
    # expression, so titles containing characters such as ( ) + ? * either
    # failed to match their own log entry or raised re.error.
    if pagelog in text_get:
        wikipedia.output(u"Already in the log...")
        return False # Error, return False

    # Entry format:
    # "wikipedia page title", "destination project code", "reason", "destination title",
    rep_text = u'\n\n"%s", "wikt", ".", "%s",' % (pagelog, newpag)
    another_page.put(u'%s%s' % (text_get, rep_text), comment = com, minorEdit = True)
    wikipedia.output(u"...Reported...")
    return True
 
if __name__ == '__main__':
    # Script entry point: run the bot and call wikipedia.stopme() afterwards
    # no matter how main() ends (normal return or exception).
    try:
        main()
    finally:
        wikipedia.stopme()
Personal tools