Python:Text parsing

From Botwiki

Jump to: navigation, search

I'm writing some Python procedures to parse any text file into an ordered list of "words" and "not-words", with these aims:

  1. to produce a complete dictionary of the different words in a text;
  2. to produce statistics about the number of occurrences of each word in the text;
  3. to be able to rebuild exactly the same text (preserving its original structure) after replacing any words.

This can be useful:

  1. to fix keyboard mistakes;
  2. to transform the text by adding html tags to chosen words;
  3. to have a "word fingerprint" of the text;
  4. to build a user-defined dictionary to be used with Firefox or OpenOffice (useful when dealing with ancient texts).

....

Presently it's only a collection of routines, to be used interactively in an IDLE environment. I'm thinking about merging them into my bot, Alebot, on it.wikisource. The output of these procedures for http://it.wikisource.org/wiki/Il_cavallarizzo (a plain txt transformation of it) is published here: http://spreadsheets.google.com/pub?key=pXFwvJ34HVdPuRBvyufpcBw

Having a txt file (say "libro.txt") in the same directory, simply load the module and then call:

res1,res2=dict("libro.txt")

You'll get two variables containing the result of the txt parsing, and two txt files will be written into the same dir. I'll update my routines as soon as I go on with my work; I'd only like to share with you the rough idea and particularly the "parser engine", in the splitter() routine.

While reading the code, consider that my Python knowledge is basic and very rough: as I said, no classes, no regex... simply some string management and some procedures. And some fun writing (more or less) running code.


# -*- coding: cp1252 -*-


# reads a local [namefile].txt file (default: a test file, "testo.txt") and parses it; saves two local resulting txt files,
# [namefile]_voc.txt and [namefile]_stat.txt
# [namefile]_voc.txt contains all the different words from [namefile].txt, simply separated by blanks
# [namefile]_stat.txt contains all the different words from [namefile].txt as a list of words coupled with the number of occurrences
# (words and numbers are separated by a tab, each word-number pair on a new line, easy to copy and paste into a spreadsheet)

def dict(f_input="testo.txt"):
    """Parse a text file and write its vocabulary and word statistics.

    Reads ``f_input``, splits it with splitter() and hands the resulting list
    to list_to_voc(), which writes the "_voc" and "_stat" files.

    NOTE: the name shadows the builtin ``dict``; it is kept because it is the
    documented entry point of this module.

    f_input -- path of the text file to parse.

    Returns the (voc, stat) pair produced by list_to_voc().
    """
    import os.path  # local import: the module has no import section

    global caratteri

    # Insert "_voc" right before the extension. A blanket
    # replace(".", "_voc.") rewrites every dot in the path ("./libro.txt",
    # "a.b.txt") and, for a name without any dot, leaves the output path
    # equal to the input path, so the source text would be overwritten.
    radice, estensione = os.path.splitext(f_input)
    f_output = radice + "_voc" + estensione

    # "with" closes the file even if read() raises.
    with open(f_input) as f:
        testo = f.read()

    caratteri="ABCDEFGHILMNOPQRSTUVZJKXYWabcdefghilmnopqrstuvzjkwyx_'ìèòàù&" #list of characters that build "words"
    lista = splitter(testo)                  # main parsing procedure: text -> list of tokens
    voc, stat = list_to_voc(lista, f_output) # vocabulary + statistics, also written to disk
    return (voc, stat)


    
# main parsing engine; converts a text of any length into a list of alternating "words" (a "word" is a continuous sequence of
# "caratteri", the characters listed in the previous routine) and "not words" (any character or sequence of characters not
# listed there: blanks, punctuation, numbers....)
def splitter(testo):
    """Split ``testo`` into maximal runs of "word" and "not-word" characters.

    A character is a word character when it appears in the module-level
    string ``caratteri``. Consecutive characters of the same kind are grouped
    into one item, so the items alternate between the two kinds and
    "".join(result) rebuilds ``testo`` exactly.

    testo -- the text to split.

    Returns a list of non-empty strings; an empty text gives an empty list.
    """
    if not testo:
        return []

    parole = []
    inizio = 0                        # start index of the run being collected
    tipo = testo[0] in caratteri      # kind of the current run (True = word)

    # Only real characters are classified. Probing one position past the end
    # yields "" and `"" in caratteri` is always True, which used to append a
    # spurious empty item after a text ending with a not-word character.
    for i in range(1, len(testo)):
        tipo_i = testo[i] in caratteri
        if tipo_i != tipo:            # the kind changes: close the current run
            parole.append(testo[inizio:i])
            inizio = i
            tipo = tipo_i

    # last run (always non-empty here)
    parole.append(testo[inizio:])
    return parole

# builds the list of the different words (vocabulary)
# 1. gets the "list-transform" of txt and builds a list of unique words;
# 2. writes [nomefile]_voc.txt and [nomefile]_stat.txt
def list_to_voc(lista,f_output):
    """Build the vocabulary and the word counts of a token list and save them.

    lista    -- list of strings as returned by splitter().
    f_output -- path of the vocabulary file; the statistics file name is
                obtained by replacing "_voc." with "_stat." in it.

    An item counts as a word when it is non-empty and its first character is
    in the module-level string ``caratteri``.

    Writes two files: the vocabulary (sorted words, each followed by a blank)
    and the statistics (one "word<TAB>count" line per word), then prints a
    summary line.

    Returns (voc, stat): the sorted list of distinct words and a mapping
    word -> number of occurrences.
    """
    stat = {}
    for elemento in lista:
        # Empty items must be skipped explicitly: `"" in caratteri` is always
        # True, so an empty string would otherwise be counted as a word.
        if elemento and elemento[0] in caratteri:
            # dictionary lookup instead of scanning a list for every item
            stat[elemento] = stat.get(elemento, 0) + 1

    # alphabetical order
    voc = sorted(stat)

    f_stat = f_output.replace("_voc.", "_stat.")
    if f_stat == f_output:
        # No "_voc." in the name: do not let the statistics overwrite the
        # vocabulary file that is written just before.
        f_stat = f_output + "_stat"

    # writes file output (_voc); "with" closes the file even on errors
    with open(f_output, "w") as f:
        f.write("".join([parola + " " for parola in voc]))

    # writes file output (_stat)
    with open(f_stat, "w") as f:
        f.write("".join([parola + "\t" + str(stat[parola]) + "\n" for parola in voc]))

    # Single-argument print(...) gives the same output under Python 2 and 3.
    print("Il testo contiene "+str(len(lista))+" parole e il vocabolario contiene "+str(len(voc))+" parole.")
    return voc,stat

# two routines to find and replace strings (for no-regex coders ;-) as I am )
# finds and returns the first substring of a string that begins with the string idi and ends with the string idf
def find_stringa(stringa,idi,idf,dc=0):
    """Return the first substring of ``stringa`` delimited by ``idi`` and ``idf``.

    stringa -- the string to search.
    idi     -- opening delimiter.
    idf     -- closing delimiter; it is searched after the end of ``idi``.
    dc      -- 0 (default): return only the text between the delimiters;
               any other value: return the delimiters as well.

    Returns "" when either delimiter is not found.
    """
    inizio = stringa.find(idi)
    if inizio == -1:
        return ""

    # Start looking after the opening delimiter, so that identical or
    # overlapping delimiters (e.g. idi == idf == "''") are matched as a pair.
    # The -1 test is made on the raw find() result: adding len(idf) first
    # turns a "not found" into a positive index when idf is 2+ characters.
    fine = stringa.find(idf, inizio + len(idi))
    if fine == -1:
        return ""

    if dc == 0:
        return stringa[inizio + len(idi):fine]
    return stringa[inizio:fine + len(idf)]
# replaces the first substring of a string that begins with the string idi and ends with the string idf with the string new
def el_st1(stringa,idi,idf,new=""):
    """Replace the first ``idi`` ... ``idf`` span of ``stringa`` with ``new``.

    stringa -- the string to edit.
    idi     -- opening delimiter.
    idf     -- closing delimiter; it is searched after the end of ``idi``.
    new     -- replacement text (default: "", i.e. the span is removed).

    Returns (valore, vvalore): the edited string and the span that was
    replaced, delimiters included. When either delimiter is missing the
    string is returned unchanged together with "".
    """
    inizio = stringa.find(idi)
    if inizio == -1:
        # Nothing to replace: return an empty span instead of referencing a
        # variable that was never assigned.
        return stringa, ""

    # Test the raw find() result before adding len(idf), and search after
    # the opening delimiter so identical delimiters are matched as a pair.
    fine = stringa.find(idf, inizio + len(idi))
    if fine == -1:
        return stringa, ""
    fine = fine + len(idf)

    vvalore = stringa[inizio:fine]
    valore = stringa[:inizio] + new + stringa[fine:]
    return valore, vvalore


# a test routine to use previous outputs to verify and fix a different text; work in progress
def ki(filetesto="testo.txt",filevoc="vocabolario.txt",tag='<span style="color:red;">*</span>'):
    """Mark the words of a text that are missing from a vocabulary file.

    filetesto -- path of the text to check.
    filevoc   -- path of a vocabulary file: words separated by whitespace.
    tag       -- template wrapped around every unknown word; its "*" is
                 replaced by the word itself.

    The text is split with splitter(); every word that is not listed in the
    vocabulary is replaced by the filled-in ``tag``. The result is written
    twice: to "<name>_corr<ext>" (page markers become "PAGINA" between blank
    lines) and to "<name>_corr.htm" (page markers become "PAGINA" between
    <br> tags, wrapped in <html><body>). A summary line is printed.

    NOTE(review): the text is written to the .htm file without HTML escaping.
    """
    import os.path  # local import: the module has no import section

    global caratteri
    caratteri="ABCDEFGHILMNOPQRSTUVZJKXYWabcdefghilmnopqrstuvzjkwyx_'ìèòàù&"

    # Derive the output names from the extension only; replacing every "."
    # corrupts paths such as "./testo.txt" or "a.b.txt".
    radice, estensione = os.path.splitext(filetesto)
    fileoutput = radice + "_corr" + estensione
    fhtm = radice + "_corr.htm"
    if fhtm == fileoutput:
        # the input already had the .htm extension: keep the two outputs apart
        fhtm = fileoutput + ".htm"

    with open(filetesto) as f:
        testo = f.read()
    # page-image placeholders become a marker made of not-word characters
    testo = testo.replace("[Image:00%.svg pagina]","####")

    with open(filevoc) as f:
        voc = set(f.read().split())   # set: constant-time membership tests

    lista = splitter(testo)
    nw = 0                            # number of suspicious words found
    for i, elemento in enumerate(lista):
        # Empty items are skipped: `"" in caratteri` is always True, so an
        # empty string would be reported (and tagged) as an unknown word.
        if elemento and elemento[0] in caratteri and elemento not in voc:
            nw = nw + 1
            lista[i] = tag.replace("*", elemento)

    # The marker is replaced item by item, as each item is written out.
    with open(fileoutput, "w") as f_output:
        f_output.write("".join([e.replace("####","\n\nPAGINA\n\n") for e in lista]))

    with open(fhtm, "w") as f_htm:
        f_htm.write("<html><body>")
        f_htm.write("".join([e.replace("####","<br><br>PAGINA<br><br>") for e in lista]))
        f_htm.write("</body></html>")

    # Single-argument print(...) gives the same output under Python 2 and 3.
    print("Sono state individuate "+str(nw)+" parole sospette; il testo è su "+fileoutput)
    return

--Alex brollo 14:08, 28 October 2008 (UTC)

Personal tools