Planning the future of Botwiki! - Help us bring Botwiki up to date, contribute to our strategy discussion, add bot scripts, and contribute manuals, guides, and tutorials! Almost anything related to bots, particularly those used to edit MediaWiki, is welcome.
UNABLE TO EDIT? - We've experienced attacks by spambots lately and now require you to confirm your e-mail before you can edit (go to your preferences, enter an e-mail address, and request a confirmation e-mail, then go to your e-mail and click on the confirmation link). We also require new accounts to make a few edits and wait a few minutes before they can create a page; however, if this is a problem, contact us in #botwiki and we can manually confirm your account. Sorry for the inconvenience.
Python:Archive index.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Generates indexes of archived talk pages.

The following parameters are supported:

-debug            If given, doesn't do any real changes, but only shows
                  what would have been changed.

-log              Writes output to logfile

-page:pagename    Create an index only on this page. Otherwise all pages
                  which transclude the hometemplate will be processed.

-logbook:pagename Write a log to this page

-defaulttemplate:pagename
                  Default template to use

-hometemplate:pagename
                  page which is transcluded to generate the index.
                  *** This is required! ***
"""
__version__ = '$Id$'

import wikipedia
import pagegenerators
import re
import sys
import zlib
from time import strftime, localtime
from operator import itemgetter

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
}


# contains handy static functions
class TextFunctions:
    '''Namespace for the static text helpers used while building an index.'''

    # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/145672/index_txt
    @staticmethod
    def formatBlock(block):
        '''Format the given block of text, trimming leading/trailing
        empty lines and the indentation of the first line.

        The purpose is to let us list a code block as a multiline,
        triple-quoted Python string, taking care of indentation concerns.

        The whitespace that starts the first non-empty line is removed
        from the start of every line that begins with it; other lines are
        left untouched.  The result always ends with a newline; a block
        without any non-empty line yields a single newline.
        '''
        def trim(seq):
            # drop empty strings from both ends of the list, in place
            while seq and not seq[0]:
                del seq[0]
            while seq and not seq[-1]:
                del seq[-1]

        # separate block into lines
        lines = str(block).split('\n')
        trim(lines)
        if not lines:
            # nothing left: there is no first line to measure
            return '\n'

        # look at first line to see how much indentation to trim
        ws = re.match(r'\s*', lines[0]).group(0)
        if ws:
            prefixlen = len(ws)
            stripped = []
            for line in lines:
                # only strip a real prefix; never touch whitespace that
                # merely occurs somewhere inside the line
                if line.startswith(ws):
                    line = line[prefixlen:]
                stripped.append(line)
            lines = stripped

        # trim again: lines consisting only of the indentation are now empty
        trim(lines)
        return '\n'.join(lines) + '\n'

    @staticmethod
    def getanchor(sectiontext):
        '''
        get the anchor link of a section based on the title

        on the dutch wikipedia it is not enough to call
        wikipedia.sectionencode, so wiki formatting is removed first
        '''
        # remove [[ ]] around sectionname
        anchor = TextFunctions.removeformatting(sectiontext)
        anchor = wikipedia.sectionencode(anchor,
                                         wikipedia.getSite().encoding())
        # remove spaces at start and end (which are converted to underscores)
        return anchor.strip('_')

    @staticmethod
    def removeformatting(linktext):
        '''
        remove [[ ]] and '' from a string, convert to text which would
        be shown

        [[target|label]] becomes label, [[target]] becomes target, and the
        quote markers of bold and italic spans are dropped.
        '''
        # piped links: keep only the label
        piped = re.compile(r'\[\[ [^|\]]* \|( [^|\]]* ) \]\]', re.VERBOSE)
        linktext = piped.sub(r'\1', linktext)
        # plain links: keep the target
        plain = re.compile(r'\[\[ ( [^\]]* ) \]\]', re.VERBOSE)
        linktext = plain.sub(r'\1', linktext)
        # non-greedy, so several bold/italic spans in one title are each
        # unwrapped instead of being merged into one big match
        linktext = re.sub(r"'''(.*?)'''", r'\1', linktext)
        linktext = re.sub(r"''(.*?)''", r'\1', linktext)
        return linktext


class Templates:
    '''
    Used to generate a table from an index

    a template is defined as a python dictionary which maps a section
    name (HEADER, ROW, ...) to the wikitext of that section

    use get to load a template from a wikipedia page
    use parsetemplate to create a template from a string
    use processindex to create a textual table of an index based on
    a template
    '''

    def __init__(self):
        # cache of parsed templates, keyed by page name
        self.templates = {}
        self.default = 'default'
        self.templates[self.default] = self.getdefaulttemplate()

    def loadpage(self, name):
        '''Fetch the wiki page `name` and cache it as a parsed template.

        Nothing is cached when name is None, or when the page is missing
        or is a redirect.
        '''
        if name is None:
            return
        page = wikipedia.Page(wikipedia.getSite(), name)
        try:
            text = page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            return
        self.templates[name] = self.parsetemplate(text)

    def get(self, name):
        '''Return the template called `name`, loading it when needed.

        Falls back to the built-in default template when it cannot be
        loaded.
        '''
        if name not in self.templates:
            self.loadpage(name)
        if name in self.templates:
            return self.templates[name]
        return self.templates[self.default]

    def getdefaulttemplate(self):
        '''Return the parsed built-in template (a two column sortable table).'''
        text = TextFunctions.formatBlock('''
            <!-- HEADER -->
            {| class="sortable"
            ! Onderwerp !! Link
            <!-- ROW -->
            |-
            | %%topic%% || [[%%link%%|%%page%%]]
            <!-- ALT ROW -->
            |- style="background: #dddddd;"
            | %%topic%% || [[%%link%%|%%page%%]]
            <!-- FOOTER -->
            |}
            <!-- END -->
            ''')
        return self.parsetemplate(text)

    def parsetemplate(self, text):
        '''Split `text` on its <!-- NAME --> comments into a dictionary.

        Each comment starts a section; the stripped text which follows it,
        plus a newline, is stored under the stripped comment text.  Text in
        front of the first comment is stored under the empty string.
        '''
        section = ''
        ret = {}
        seperators = re.compile(r'(<!--[^-]*-->)')
        seperators2 = re.compile(r'<!--([^-]*)-->')
        # the capturing group keeps the comments themselves in the result
        for part in seperators.split(text):
            marker = seperators2.match(part)
            if marker is not None:
                section = marker.group(1).strip()
            else:
                ret[section] = part.strip() + '\n'
        return ret

    def processindex(self, template, index):
        '''Render `index` (a list of entry dictionaries) with `template`.

        Returns an empty string for an empty index.  Otherwise the result
        is LEAD, HEADER, one ROW per entry (ALT ROW for every second entry
        when the template has one), FOOTER and TAIL, where every section
        except ROW is optional.
        '''
        if len(index) == 0:
            return ''
        ret = ''
        for key in ('LEAD', 'HEADER'):
            if key in template:
                ret += template[key]
        count = 0
        for r in index:
            count += 1
            if (count % 2 == 0) and ('ALT ROW' in template):
                t = template['ALT ROW']
            else:
                t = template['ROW']
            t = t.replace('%%topic%%', r['topic'])
            t = t.replace('%%link%%', r['link'])
            t = t.replace('%%page%%', r['page'])
            ret += t
        for key in ('FOOTER', 'TAIL'):
            if key in template:
                ret += template[key]
        # allow %%subst%% and %%now%% to be replaced in all headers, not just ROW
        ret = ret.replace('%%subst%%', 'subst:')
        ret = ret.replace('%%now%%', strftime("%d %b %Y %H:%M (%Z)"))
        # escapes which let a template page contain braces without the
        # wiki expanding them on the template page itself
        ret = ret.replace('%%((%%', '{{')
        ret = ret.replace('%%))%%', '}}')
        ret = ret.replace('%%(%%', '{')
        ret = ret.replace('%%)%%', '}')
        return ret
class IndexGenerator:
    '''
    Create an archive index of a number of pages

    an index is a list of dictionaries with the following keys
        sortkey : lowercase text useful for sorting
        link    : link to page
        page    : title of the (sub) page
        topic   : title of section

    readoptions is used to read an optionstring
    getoptionstring to return the current options
    addpage is an internal function to process a single page
    retrieve is used to generate the index using the previous set options
    '''

    def __init__(self):
        # one dictionary per 'page' / 'pageprefix' option, in reading order
        self.pages = []
        # options which apply to the whole index ('template', 'checksum')
        self.globaloptions = {}

    def setoption(self, name, value):
        '''Store str(value) as global option `name`; None removes it.'''
        if value is None:
            if name in self.globaloptions:
                del self.globaloptions[name]
        else:
            self.globaloptions[name] = str(value)

    def readoption(self, name):
        '''Return the global option `name`, or None when it is not set.'''
        if name in self.globaloptions:
            return self.globaloptions[name]
        return None

    def changedchecksum(self, checksum):
        '''Compare `checksum` with the stored one and remember it.

        Returns False when the stored checksum has the same string form,
        otherwise stores the new checksum and returns True.
        '''
        checksum = str(checksum)
        if ('checksum' in self.globaloptions
                and str(self.globaloptions['checksum']) == checksum):
            return False
        self.globaloptions['checksum'] = checksum
        return True

    def readoptions(self, txt, pagename):
        '''Parse an option string of the form "key=value;key=value;...".

        page / pageprefix start a new page entry; name / include / exclude
        are attached to the most recent page entry; checksum / template are
        stored as global options.  Anything else with a key and a value is
        reported as unknown.  When no page entry results, the subpages of
        `pagename` are used.
        '''
        for option in txt.split(';'):
            # split on the first '=' only, so that values which contain
            # '=' themselves (e.g. a regex for include) are kept intact
            opt = option.split('=', 1)
            if len(opt) != 2:
                continue
            key, value = opt
            if key in ('page', 'pageprefix'):
                self.pages.append({key: value})
            elif key in ('name', 'include', 'exclude') and len(self.pages) != 0:
                self.pages[-1][key] = value
            elif key in ('checksum', 'template'):
                self.globaloptions[key] = value
            else:
                wikipedia.output('unknown/invalid option: %s=%s' % (key, value))
        if len(self.pages) == 0:
            self.pages.append({'pageprefix': pagename + '/'})

    def getoptionstring(self):
        '''Return the current options in the format readoptions accepts.'''
        parts = []
        if 'template' in self.globaloptions:
            parts.append('template=%s;' % self.globaloptions['template'])
        for indexpage in self.pages:
            if 'page' in indexpage:
                # retrieve() uses the page title as name when none is set;
                # do the same here instead of failing with a KeyError
                name = indexpage.get('name', indexpage['page'])
                parts.append('page=%s;name=%s;' % (indexpage['page'], name))
            elif 'pageprefix' in indexpage:
                parts.append('pageprefix=%s;' % indexpage['pageprefix'])
                if 'name' in indexpage:
                    parts.append('name=%s;' % indexpage['name'])
                # include wins over exclude, as it does in retrieve()
                if 'include' in indexpage:
                    parts.append('include=%s;' % indexpage['include'])
                elif 'exclude' in indexpage:
                    parts.append('exclude=%s;' % indexpage['exclude'])
        if 'checksum' in self.globaloptions:
            parts.append('checksum=%s;' % self.globaloptions['checksum'])
        return ''.join(parts)

    def _makeentry(self, pagetitle, shortname, anchortitle, topictext):
        '''Build one index entry linking to section `anchortitle`.'''
        anchor = TextFunctions.getanchor(anchortitle)
        linktext = TextFunctions.removeformatting(topictext)
        # only the letters a-z take part in sorting
        sortkey = re.sub('[^a-z]', '', linktext.lower())
        return {'sortkey': sortkey,
                'link': pagetitle + "#" + anchor,
                'page': shortname,
                'topic': linktext}

    def addpage(self, page, shortname):
        '''Return the index entries for the sections of `page`.

        Every level 2 and level 3 section yields one entry; a level 3
        entry shows its level 2 parent in brackets.  Level 4 sections are
        recognised but not indexed.  A missing or redirect page yields an
        empty list.
        '''
        pagetitle = page.title()
        try:
            text = page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            wikipedia.output('error get()')
            return []

        # \n is enough...
        text = re.sub('\r', '', text)

        ret = []
        lasttitle2 = ""
        lasttitle3 = ""
        lasttitle4 = ""
        # the optional tail makes one-character titles ("== A ==") match
        # too; they used to be mistaken for body text
        title2 = re.compile(r'^==\ *([^= ](?:.*[^= ])?)\ *==$')
        title3 = re.compile(r'^===\ *([^= ](?:.*[^= ])?)\ *===$')
        title4 = re.compile(r'====\ *([^= ](?:.*[^= ])?)\ *====')
        alltitles = re.compile(r'^(==.*==)$', re.MULTILINE)

        # the split alternates body text with heading lines
        for part in alltitles.split(text):
            t2 = title2.match(part)
            t3 = title3.match(part)
            t4 = title4.match(part)
            if t2 is not None:
                lasttitle2 = t2.group(1)
                lasttitle3 = ""
            elif t3 is not None:
                lasttitle3 = t3.group(1)
                lasttitle4 = ""
            elif t4 is not None:
                lasttitle4 = t4.group(1)
            elif lasttitle4 != "":
                # body of a level 4 section: not indexed
                lasttitle4 = ""
            elif lasttitle3 != "":
                ret.append(self._makeentry(
                    pagetitle, shortname, lasttitle3,
                    lasttitle3 + " (" + lasttitle2 + ")"))
            elif lasttitle2 != "":
                ret.append(self._makeentry(
                    pagetitle, shortname, lasttitle2, lasttitle2))
        return ret

    def retrieve(self):
        '''Build and return the complete index, sorted by sortkey.

        As a side effect every page entry gets a 'name' when it has none:
        the page title for 'page' entries, an empty string for
        'pageprefix' entries.
        '''
        ret = []
        for indexpage in self.pages:
            if 'page' in indexpage:
                if 'name' not in indexpage:
                    indexpage['name'] = indexpage['page']
                page = wikipedia.Page(wikipedia.getSite(), indexpage['page'])
                ret.extend(self.addpage(page, indexpage['name']))
            elif 'pageprefix' in indexpage:
                if 'name' not in indexpage:
                    indexpage['name'] = ''
                prefix = indexpage['pageprefix']
                pagelist = []
                subpagegen = pagegenerators.PrefixingPageGenerator(prefix=prefix)
                for subpage in subpagegen:
                    title = subpage.title()
                    if 'include' in indexpage:
                        if re.search(indexpage['include'], title) is not None:
                            pagelist.append(title)
                    elif 'exclude' in indexpage:
                        if re.search(indexpage['exclude'], title) is None:
                            pagelist.append(title)
                    else:
                        pagelist.append(title)
                if pagelist != []:
                    gen = iter([wikipedia.Page(wikipedia.getSite(), t)
                                for t in pagelist])
                    gen = pagegenerators.PreloadingGenerator(gen)
                    for page in gen:
                        # shown name: configured name + title without prefix
                        shortname = indexpage['name'] + page.title()[len(prefix):]
                        ret.extend(self.addpage(page, shortname))
        ret.sort(key=itemgetter('sortkey'))
        return ret


class ArchiveBot:
    '''
    Bot which rewrites the archive index on every page it is given.
    '''
    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        'en': u'Robot: Create archive index',
        # \xeb is e-diaeresis ("Creeer" with diaeresis on the second e);
        # written as an escape so the source encoding cannot garble it
        'nl': u'robot: Cre\xeber archief index',
    }

    def __init__(self, debug, hometemplate, defaulttemplate, logbook, singlepage):
        """
        Constructor. Parameters:
            * debug           - If True, doesn't do any real changes, but only
                                shows what would have been changed.
            * hometemplate    - name of the template whose transclusion marks
                                (and configures) an index.
            * defaulttemplate - page name of the table template used when a
                                page does not name one; may be None.
            * logbook         - page name to append a run log to; may be None.
            * singlepage      - title of the only page to process; when None
                                all pages transcluding hometemplate are
                                processed.
        """
        self.generator = None
        self.debug = debug
        self.hometemplate = hometemplate
        self.defaulttemplate = defaulttemplate
        self.logbook = logbook
        self.singlepage = singlepage
        self.acceptall = False
        self.processed = 0
        self.changecount = 0
        self.errorcount = 0
        self.templates = Templates()
        if self.singlepage is not None:
            self.generator = iter([wikipedia.Page(wikipedia.getSite(),
                                                  self.singlepage)])
        else:
            transclusionPage = wikipedia.Page(wikipedia.getSite(),
                                              self.hometemplate)
            self.generator = pagegenerators.ReferringPageGenerator(
                transclusionPage, onlyTemplateInclusion = True)
        self.generator = pagegenerators.PreloadingGenerator(self.generator)

    def createlog(self):
        '''Append a summary of this run to the logbook page.

        Does nothing without a logbook or when the run neither changed a
        page nor hit an error.  In debug mode the diff is shown but not
        saved.  Expects run() to have set starttime and endtime.
        '''
        if self.logbook is None:
            return
        if (self.changecount + self.errorcount) == 0:
            return
        log_page = wikipedia.Page(wikipedia.getSite(), self.logbook)
        try:
            log_text = log_page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            log_text = ''
        old_log_text = log_text
        args = [wikipedia.decodeArg(sys.argv[0])]
        args += [wikipedia.decodeArg('"%s"' % s) for s in sys.argv[1:]]
        log_text += '\n* Start: %s\n' % self.starttime
        log_text += r'* Command: <nowiki>' + u' '.join(args) + r'</nowiki>' + '\n'
        log_text += '* Processed: %d pages\n' % self.processed
        log_text += '* Changes: %d pages\n' % self.changecount
        log_text += '* Errors: %d pages\n' % self.errorcount
        log_text += '* End: %s\n' % self.endtime
        log_text += '----\n'
        com = wikipedia.translate(wikipedia.getSite(), self.msg) + ' (Log)'
        wikipedia.showDiff(old_log_text, log_text)
        if not self.debug:
            try:
                log_page.put(log_text, comment = com, minorEdit = True)
            except Exception:
                # logging is best effort, but do not swallow
                # KeyboardInterrupt / SystemExit like a bare except would
                wikipedia.output(u'Could not save log')

    def run(self):
        '''Process every page of the generator, then write the log.'''
        self.starttime = strftime("%d %b %Y %H:%M (%Z)")
        # Set the edit summary message
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))
        for page in self.generator:
            self.treat(page)
        self.endtime = strftime("%d %b %Y %H:%M (%Z)")
        self.createlog()

    def treat(self, page):
        """
        Loads the given page, regenerates its archive index, and saves it.

        The page is left alone when the checksum over the index links equals
        the one stored in the template call.  Outside debug mode the user is
        asked to confirm each change until 'All' is chosen.
        """
        self.processed += 1
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\03{lightpurple}%s\03{default}:" % page.title())
        try:
            # Load the page
            oldtext = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return
        if not page.botMayEdit(wikipedia.getSite().loggedInAs()):
            wikipedia.output(u"Page %s is locked for robot editing; skipping."
                             % page.aslink())
            return
        text = oldtext

        thisindex = IndexGenerator()
        # the template name is literal text, not a pattern
        name = re.escape(self.hometemplate)
        # the template call, plus everything up to the end marker when an
        # index was generated before
        fulltext = re.compile(
            r'(\{\{' + name + r'(\|[^}]*)?\}\}(.*' + name + r'-->)?)',
            re.DOTALL)
        tmplonly = re.compile(r'\{\{' + name + r'\|([^}]*)\}\}')
        tmplopt = tmplonly.search(text)
        if tmplopt is not None:
            thisindex.readoptions(tmplopt.group(1), page.title())
        else:
            wikipedia.output('cannot read options, using default')
            thisindex.readoptions('', page.title())
        if thisindex.readoption('template') is None:
            thisindex.setoption('template', self.defaulttemplate)

        idx = thisindex.retrieve()
        # the checksum covers the links only, so the page is rewritten
        # when the set of indexed sections changes
        checktemplate = self.templates.parsetemplate(r'<!--ROW-->%%link%%')
        checktext = self.templates.processindex(checktemplate, idx)
        checksum = zlib.adler32(checktext.encode('utf8')) & 0xffffffff
        t = self.templates.get(thisindex.readoption('template'))
        newtext = self.templates.processindex(t, idx)
        if thisindex.changedchecksum(checksum):
            replacement = ('{{' + self.hometemplate + '|'
                           + thisindex.getoptionstring() + '}}'
                           + newtext
                           + '<!--' + self.hometemplate + '-->')
            # a callable keeps backslashes in the generated wikitext from
            # being interpreted as escapes or group references
            text = fulltext.sub(lambda match: replacement, text)
        else:
            wikipedia.output('Not changed')

        ###############################
        # save if something was changed
        if text == oldtext:
            return
        # show what was changed
        wikipedia.showDiff(oldtext, text)
        if self.debug:
            return
        if self.acceptall:
            choice = 'y'
        else:
            choice = wikipedia.inputChoice(
                u'Do you want to accept these changes?',
                ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
            if choice == 'a':
                choice = 'y'
                self.acceptall = True
        if choice != 'y':
            return
        try:
            # Save the page
            page.put(text)
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
            self.errorcount += 1
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict'
                             % (page.title()))
            self.errorcount += 1
        except wikipedia.SpamfilterError as error:
            wikipedia.output(
                u'Cannot change %s because of spam blacklist entry %s'
                % (page.title(), error.url))
            self.errorcount += 1
        else:
            # count a change only when the page really was saved
            self.changecount += 1


def main():
    '''Parse the command line and run the bot.'''
    # If debug is True, doesn't do any real changes, but only show
    # what would have been changed.
    debug = False
    hometemplate = None
    logbook = None
    defaulttemplate = None
    singlepage = None

    # Parse command line arguments
    for arg in wikipedia.handleArgs():
        if arg.startswith("-debug"):
            debug = True
        elif arg.startswith('-page:'):
            singlepage = arg[len('-page:'):]
        elif arg.startswith('-logbook:'):
            logbook = arg[len('-logbook:'):]
        elif arg.startswith('-defaulttemplate:'):
            defaulttemplate = arg[len('-defaulttemplate:'):]
        elif arg.startswith('-hometemplate:'):
            hometemplate = arg[len('-hometemplate:'):]

    if hometemplate is None:
        wikipedia.output('hometemplate is required')
        return

    bot = ArchiveBot(debug, hometemplate, defaulttemplate, logbook, singlepage)
    bot.run()


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()