Purple exclamation mark.svg Planning the future of Botwiki! - Help us bring Botwiki up to date, contribute to our strategy discussion, add bot scripts, and contribute manuals, guides, and tutorials! Almost anything related to bots, particularly those used to edit mediawiki, is welcome.

Red exclamation mark.svg UNABLE TO EDIT? - We've experienced attacks by spambots lately and now require you to confirm your e-mail before you can edit (go to your preferences, enter an e-mail address, and request a confirmation e-mail, then go to your e-mail and click on the confirmation link). We also require new accounts to make a few edits and wait a few minutes before you can create a page; however, if this is a problem, contact us in #botwiki and we can manually confirm your account. Sorry for the inconvenience.

Manual:Replacer.py

From Botwiki
Jump to: navigation, search
# -*- coding: utf-8 -*-
'''
Critical issues:
* max. 10 threads to not make too many edits and keep database connections below limit
* www.mediawiki.org must be supported
 
FIXMEs:
* Right after getting the CheckUsage results, it starts retrieving [[User:CommonsDelinker/replace-I18n]]. It does this multiple times for each site. That's quite inefficient.
'''
 
import wikipedia, config, codecs
import urllib2, re, time, thread
import MySQLdb 
 
# English month names indexed by month number; index 0 is an empty filler.
months = [
    '',
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# Edit-summary templates. 'default' is the built-in fallback text; $1 and $2
# are substituted with the old and new image names by replace_image().
summaries = {
    'default': u'[[w:commons:User:Orgullobot/commands|Bot]]: Replacing $1 with $2. [[m:User:CommonsDelinker|Translate me]] [[User:CommonsDelinker/replace-I18n|here]]!',
}

# Not referenced anywhere else in the visible part of this file.
done = []

# Pages the bot is currently editing/checking, used to avoid edit conflicts
# between the worker threads.
# Note: This is NOT a good way to do this.
editing = []

# Pages already confirmed to exist, so that they are not checked again.
existentes = []
def pageText(url):
    """Fetch *url* over HTTP and return the raw (undecoded) response body.

    The request carries a browser-like User-Agent header. Any error raised
    by urllib2 while opening or reading the URL propagates to the caller.
    """
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
    request = urllib2.Request(url)
    request.add_header("User-Agent", user_agent)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
 
def checanombres(page):
    """Return the namespace number of *page* as an int.

    page -- a wikipedia.Page; its site's api.php is queried over HTTP.

    The number is taken from the ``ns`` attribute of the first ``<page``
    element of the XML answer. Only page metadata (``prop=info``) is
    requested, because the page content is not needed to read the namespace.

    Raises IndexError when the answer holds no ``<page ... ns="...">``
    element and ValueError when the attribute is not an integer.
    """
    url = ('http://' + page.site().hostname()
           + '/w/api.php?action=query&prop=info&titles='
           + page.urlname() + '&format=xml')
    crudo = pageText(url)
    # Keep only the attribute list of the opening <page ...> tag.
    page_tag = crudo.split('<page ')[1].split('>')[0]
    return int(page_tag.split('ns="')[1].split('"')[0])
 
 
# Runs at import time: look up the namespace number of one fixed page on the
# Spanish Wikipedia and print it.
pagelink = wikipedia.Page(
    wikipedia.Site('es', 'wikipedia'),
    'Wikipedia:Putos',
)
wikipedia.output(u'ChecaNombres: ' + str(checanombres(pagelink)))

# Log rows queued by record() until they are written to the database.
records = []
def _release_db_slot(token):
    """Remove one occurrence of *token* from the throttle file."""
    dbt = open('dbthrottle.txt', 'r')
    try:
        dbtt = dbt.read()
    finally:
        dbt.close()
    dbtW = open('./dbthrottle.txt', 'w')
    try:
        # Count of 1: never drop another thread's identical token.
        dbtW.write(dbtt.replace(token + '\n', '', 1))
    finally:
        dbtW.close()


def record(hora, page, img, new_image_name="NULL", status="ok"):
    """Queue one log row; write the queue to MySQL once it exceeds 50 rows.

    hora           -- value stored in the first column of the row
    page           -- wikipedia.Page the row is about
    img            -- name of the image that was replaced
    new_image_name -- name of the replacement image
    status         -- outcome label stored with the row ("ok", "failed", ...)

    When a flush happens, one throttle slot is claimed through canIDB()
    before connecting and is given back afterwards, also on failure. Rows
    are inserted into the ``delinker`` table with a parameterised query.
    Flushed rows are removed from the queue only after a successful commit,
    so a failed flush is retried by a later call. Database and I/O errors
    propagate to the caller.
    """
    records.append((hora, page, img, new_image_name, status))
    if len(records) <= 50:
        return
    print('Recording...')

    # Work on a snapshot: other threads may keep appending meanwhile.
    batch = records[:]

    # Build every row before touching the database so the connection is
    # held as briefly as possible (checanombres() does an HTTP request).
    rows = []
    for stamp, logged_page, old_name, new_name, outcome in batch:
        wikiT = str(logged_page.site()).split(':')
        if wikiT[0] == 'commons':
            wikiT = ['wikimedia', 'commons']
        wiki = wikiT[1] + '.' + wikiT[0] + '.org'
        fortit = logged_page.titleWithoutNamespace().replace(' ', '_')
        rows.append((stamp, old_name.replace(' ', '_'), wiki, fortit,
                     str(checanombres(logged_page)), outcome, new_name))

    # One throttle slot per flush, claimed before the connection is opened.
    token = str(time.time())
    claimed = canIDB(token)
    try:
        conn = MySQLdb.connect(host="sql", user="orgullo", passwd="****",
                               db="u_orgullo_logs", charset='utf8',
                               use_unicode=1)
        try:
            cursor = conn.cursor()
            cursor.execute('set names utf8;')
            for row in rows:
                # The driver quotes the parameters; nothing is spliced into
                # the SQL text.
                cursor.execute(
                    "insert into delinker VALUES(%s, %s, %s, %s, %s, %s, %s)",
                    row)
            conn.commit()
        finally:
            conn.close()
    finally:
        if claimed:
            _release_db_slot(token)

    # Drop only what was written; rows queued during the flush stay queued.
    del records[:len(batch)]
 
#record(time.time(), wikipedia.Page(wikipedia.getSite(), 'This is a test'), 'Testimage.jpg')
#'%Y-%m-%d %H:%M
#'2006-09-22 21:01'
 
def exists(page):
    """Tell whether *page* exists, using a light query.php request.

    Pages found to exist are remembered in ``existentes`` and answered from
    there on later calls. A page in namespace 6 only counts as existing when
    the answer also contains an ``<image `` element.
    """
    #http://es.wikipedia.org/w/query.php?what=content&titles=Image:Punta%20del%20Este.jpg&aplimit=1&format=xml
    if page in existentes:
        return True

    host = page.site().hostname()
    crudo = pageText('http://' + host + '/w/query.php?what=imageinfo&titles=' + page.urlname() + '&aplimit=1&format=xml')
    page_id = crudo.split('<id>')[1].split('</id>')[0]
    namespace = crudo.split('<ns>')[1].split('</ns>')[0]

    # An id of 0 is treated as "no such page".
    if page_id == "0":
        return False
    if namespace == "6" and '<image ' not in crudo:
        return False

    existentes.append(page)
    return True
 
def getcommands():
    """Return the text of the local command file as unicode.

    Reads ``commons-commands.txt`` (UTF-8) from the current directory.
    IOError propagates when the file cannot be opened.
    """
    lo = codecs.open('commons-commands.txt', 'r', 'utf-8')
    try:
        return lo.read()
    finally:
        lo.close()
 
def canIedit():
    """Return False when the command text contains ``{{stop}}`` (any letter
    case), True otherwise."""
    commands = getcommands().lower()
    return '{{stop}}' not in commands
def canIDB(rightnow):
    """Wait for a free database slot and claim it with the token *rightnow*.

    ``dbthrottle.txt`` holds one line per claimed slot. The file is polled
    every 10 seconds until it has fewer than 10 lines; then ``rightnow``
    plus a newline is appended and True is returned.

    rightnow -- string written to the file as this caller's token

    If the file cannot be opened, read or written (IOError) the function
    sleeps 10 seconds and returns None without claiming a slot.

    NOTE(review): the check and the append are two separate steps, so two
    callers can presumably pass the check at the same time -- confirm
    whether that matters for the connection limit.
    """
    try:
        while True:
            handle = open('dbthrottle.txt', 'r')
            try:
                claimed = handle.read().count('\n')
            finally:
                handle.close()
            if claimed < 10:
                handle = open('dbthrottle.txt', 'a')
                try:
                    handle.write(rightnow + '\n')
                finally:
                    # Close explicitly so the token is on disk before the
                    # caller proceeds.
                    handle.close()
                return True
            time.sleep(10)
    except IOError:
        time.sleep(10)
        return None
 
def summary(wiki_site):
    """Return the edit-summary template to use on *wiki_site*.

    The template is the text of ``User:CommonsDelinker/replace-I18n`` on
    that site. When the page does not exist there:

    * for a site whose string form does not contain 'wikipedia', the
      template of the Wikipedia in the same language is used (the English
      Wikipedia for incubator, meta, commons and species);
    * otherwise the built-in ``summaries['default']`` text is used.

    Every result, including the fallbacks, is cached in ``summaries`` as
    ``[text, timestamp]`` and reused for up to one hour, so a site without
    its own page is not fetched again on every call. On any other error
    the default text is returned and nothing is cached.
    """
    try:
        cached = summaries.get(wiki_site)
        # Reload the summary if it's over an hour old.
        if cached is not None and time.time() - cached[1] < 3600:
            return cached[0]

        pl = wikipedia.Page(wiki_site, u'User:CommonsDelinker/replace-I18n')
        try:
            text = pl.get()
        except wikipedia.NoPage:
            if 'wikipedia' not in str(wiki_site):
                lang = str(wiki_site).split(':')[-1]
                if lang in ('incubator', 'meta', 'commons', 'species'):
                    new_site = wikipedia.Site('en', 'wikipedia')
                else:
                    new_site = wikipedia.Site(lang, 'wikipedia')
                text = summary(new_site)
            else:
                text = summaries['default']
        summaries[wiki_site] = [text, time.time()]
        return text
    except Exception:
        # Best effort: an edit summary must always be available.
        return summaries['default']
def _image_pattern(name):
    """Compile a regex matching *name* the way it may be written in wikitext.

    The first letter matches in either case, spaces and underscores are
    interchangeable, and every other character is matched literally.
    """
    parts = []
    for index, char in enumerate(name):
        if char in (' ', '_'):
            parts.append('[ _]')
        elif index == 0 and char.upper() != char.lower():
            parts.append('(?:' + re.escape(char.upper()) + '|'
                         + re.escape(char.lower()) + ')')
        else:
            parts.append(re.escape(char))
    return re.compile(''.join(parts))


def _ensure_bot_userpage(site):
    """Create the bot's user page on *site* unless the site is already noted.

    ``canedit.cdl`` lists the sites already handled as '#' + str(site).
    """
    filename = 'canedit.cdl'
    f = codecs.open(filename, 'r', 'utf-8')
    try:
        ftxt = f.read()
    finally:
        f.close()
    if '#' + str(site) in ftxt:
        return
    userpage = wikipedia.Page(site, 'User:CommonsDelinker')
    if not exists(userpage):
        userpage.put('#Redirect[[m:User:CommonsDelinker]]', '')
    f = codecs.open(filename, 'a', 'utf-8')
    try:
        f.write('#' + str(site))
    finally:
        f.close()


def _replace_in_page(img, pg, newimg, tocon):
    """Do the actual text replacement on *pg* and save the result."""
    msg = summary(pg.site())
    msg = msg.replace('$1', img)
    msg = msg.replace('$2', newimg)

    wikipedia.output(pg.title())
    # 99999999 matches no namespace visible in this file, so this guard
    # lets every page through.
    if pg.namespace() in [99999999]:
        return
    txt = pg.get()
    if pg.site() != wikipedia.Site('commons', 'commons'):
        # A local file of the same name means the page is not using the
        # Commons image; leave it alone.
        if exists(wikipedia.Page(pg.site(), 'Image:' + img)):
            print('Pulling out')
            return

    pattern = _image_pattern(img)
    print([pattern.pattern])
    print(pattern.findall(txt))
    # Single pass with a function replacement: text that was just inserted
    # is never matched again, and backslashes in newimg stay literal.
    newTxt = pattern.sub(lambda match: newimg, txt)

    if txt == newTxt:
        wikipedia.output(u'No match: ' + pg.site().hostname() + '/wiki/' + pg.urlname())
        return
    try:
        # We want to make sure the userpage is not empty.
        _ensure_bot_userpage(pg.site())
        wikipedia.showDiff(txt, newTxt)
        pg.put(newTxt, msg)
        thread.start_new_thread(record, (tocon, pg, img, newimg, "ok"))
    except wikipedia.LockedPage:
        thread.start_new_thread(record, (tocon, pg, img, newimg, "failed"))
        print('Page is locked')


def replace_image(img, pg, newimg):
    """Replace every mention of image *img* on page *pg* with *newimg*.

    img    -- name of the image to replace (with or without namespace)
    pg     -- wikipedia.Page to edit
    newimg -- name of the replacement image

    Nothing is done when the command text contains {{stop}}, when a non-SVG
    image would be replaced by an SVG one, or when a file named *img*
    exists locally on a non-Commons site. While the page is processed it is
    listed in ``editing``; it is always taken off that list again, also on
    an early exit or an exception, so other threads waiting for the page
    are not blocked forever. A saved edit is logged through record() with
    status "ok", a locked page with status "failed".

    Always returns None. Errors other than wikipedia.LockedPage propagate.
    """
    # Constant placeholder passed to record() as its first (hora) argument.
    tocon = 'a' * 14
    if not canIedit():
        return None
    print(('Replacing image', img, pg, pg.site(), newimg))
    # Round-trip through Page to strip the namespace and normalise the name.
    img = wikipedia.Page(pg.site(), img).titleWithoutNamespace()
    ext1 = img.split('.')[-1]
    ext2 = newimg.split('.')[-1]
    print('Extensions: ' + ext1 + ', ' + ext2)
    if ext2.lower() == 'svg' and ext1.lower() != 'svg':
        print('Ignoring non-SVG to SVG replacement.')
        return None
    newimg = wikipedia.Page(pg.site(), newimg).titleWithoutNamespace()
    if not img:
        # An empty name would compile to a pattern that matches everywhere.
        return None

    while pg in editing:
        time.sleep(3)
    editing.append(pg)
    try:
        _replace_in_page(img, pg, newimg, tocon)
    finally:
        while pg in editing:
            editing.remove(pg)
    return None
def checkUsage(image, newimg):
    """Start a replace_image() thread for every page reported to use *image*.

    image  -- name of the image to look up
    newimg -- name of the replacement image, handed on to replace_image()

    The usage list is scraped from the HTML of the CheckUsage tool. Pages
    on wikis for which ``config.usernames`` holds no account are skipped,
    as are list entries that contain no link. Errors from fetching the
    usage list propagate.

    NOTE(review): the domain pattern does not match ``mediawiki.org``, so
    the 'www' mapping below presumably only triggers for ``www.wikimedia.org``
    -- confirm against the "www.mediawiki.org must be supported" item in the
    module docstring.
    """
    print(('check usage', image, newimg))
    imageU = wikipedia.Page(wikipedia.getSite(), image).urlname()
    path = 'http://tools.wikimedia.de/%7Edaniel/WikiSense/CheckUsage.php?i=' + imageU + '&w=_100000#end'
    ch = pageText(path).decode('utf-8')
    projs = ch.split("class='project'")[1:]
    print((str(len(projs)) + u' projects for', image))

    # Patterns are loop-invariant, so they are compiled once.
    baseR = u'wik(?:i[mp]edia|ibooks|tionary|iquote|inews|isource|iversity)'
    project_rx = re.compile(u'http://([^\\.]*\\.' + baseR + u'\\.org)')
    wpR = u'http://(commons|incubator|meta|species|www|[^\\.]*)\\.(' + baseR + u')\\.org/wiki/(.*)'
    page_rx = re.compile(wpR)
    wikipedia.output(wpR)

    # Subdomains that map to a site whose code and family share one name,
    # instead of the usual (subdomain, domain) pair.
    special_sites = {
        u'commons': 'commons',
        u'meta': 'meta',
        u'incubator': 'incubator',
        u'species': 'species',
        u'www': 'mediawiki',
    }

    for proj in projs:
        hosts = project_rx.findall(proj)
        # The host is only used for the log line below, so a section whose
        # host cannot be parsed is still processed.
        if hosts:
            proid = hosts[0]
        else:
            proid = u'(unknown project)'
        tabla = proj.split("<div class='page'>")[1:]
        wikipedia.output(u'Checkusage returns ' + str(len(tabla)) + ' for ' + image + ' in ' + proid + '.')
        for ta in tabla:
            ta = ta.split('</div>')[0]
            if '<a href="' not in ta:
                continue
            url = ta.split('<a href="')[1].split('?uselang=en"')[0]
            wp = page_rx.findall(url)
            print(wp)
            if not wp:
                continue
            subdomain, domain, title = wp[0]
            if subdomain in special_sites:
                code = family = special_sites[subdomain]
            else:
                code, family = subdomain, domain
            try:
                # Looked up only to find out whether an account is
                # configured for this wiki.
                config.usernames[family][code]
            except KeyError:
                continue
            pagelink = wikipedia.Page(wikipedia.Site(code, family), title)
            while pagelink in editing:
                time.sleep(3)
            thread.start_new_thread(replace_image, (image, pagelink, newimg))
 
# Command chunks that have already been dispatched; each runs only once.
hechas = []
# Pages on Commons whose text is read as the list of commands.
fiables = ['User:Orgullobot/commands']


def RUN():
    """Fetch the command page(s) and dispatch every new replace command.

    Each page in ``fiables`` is downloaded from Commons and written to
    ``commons-commands.txt`` (the file is overwritten per page, so with
    several pages only the last one is kept). The file is then split at
    '{{'; every chunk not seen before is noted in ``hechas`` and, when it
    is a ``universal replace`` / ``universal_replace`` command with an old
    and a new image name, handed to checkUsage() in a new thread, with a
    5 second pause after each. Malformed commands are logged and skipped.
    """
    for fiable in fiables:
        cmd = wikipedia.Page(wikipedia.Site('commons', 'commons'), fiable)
        texto = cmd.get()
        lfile = codecs.open('./commons-commands.txt', 'w', 'utf-8')
        try:
            lfile.write(texto)
        finally:
            lfile.close()

    # Everything before the first '{{' is not a command.
    for chuleta in getcommands().split('{{')[1:]:
        # The list is deliberately not modified while iterating: removing
        # items here would make the loop skip the chunk that follows.
        if chuleta in hechas:
            continue
        hechas.append(chuleta)

        campos = chuleta.split('|')
        if campos[0].lower() not in ['universal replace', 'universal_replace']:
            continue
        if len(campos) < 3:
            wikipedia.output(u'Ignoring malformed command: {{' + chuleta)
            continue
        img = campos[1]
        newimg = campos[2].split('}}')[0]
        thread.start_new_thread(checkUsage, (img, newimg))
        time.sleep(5)
# Main loop: call RUN(), wait 60 seconds, repeat forever.
while True:
    RUN()
    time.sleep(60)
Personal tools
Share