Python:Text parsing
From Botwiki
I'm writing some Python procedures to parse any text file into an ordered list of "words" and "not-words", with these aims:
- to produce a complete dictionary of the distinct words in a text;
- to produce statistics on the number of occurrences of each word in the text;
- to be able to rebuild exactly the same text (preserving its original structure) after replacing any words.
This can be useful:
- to fix typing mistakes;
- to transform the text adding html tags to chosen words;
- to have a "word fingerprint" of the text.
- to build a user-defined dictionary to be used with Firefox or OpenOffice (useful when dealing with ancient texts)
....
Presently it's only a collection of routines, to be used interactively in an IDLE environment. I'm thinking about merging them into my bot, Alebot, on it.wikisource. The output of these procedures run on http://it.wikisource.org/wiki/Il_cavallarizzo (a plain-text transformation of it) is published here: http://spreadsheets.google.com/pub?key=pXFwvJ34HVdPuRBvyufpcBw
Having a txt file (say "libro.txt") into the same directory, simply activate the module and then call:
res1,res2=dict("libro.txt")
you'll get two variables containing the result of the text parsing, and two txt files will be written into the same directory. I'll update my routines as I go on with my work; I'd only like to share with you the rough idea and particularly the "parser engine", in the splitter() routine.
While reading the code, consider that my Python knowledge is basic and very rough: as I said, no classes, no regex... simply some string management and some procedures. And some fun writing (more or less) running code.
# -*- coding: cp1252 -*-
# reads a local txt [namefile].txt file (default: a test file, "testo.txt") and parses it; saves two local resulting txt files,
# [namefile]_voc.txt and [namefile]_stat.txt
# [namefile]_voc.txt contains all the words from [namefile].txt, simply separated by blanks
# [namefile]_stat.txt contains all the words from [namefile].txt as a list of words coupled with the number of occurrences (words
# and numbers are separated by a tab, each word-number pair on its own line, easy to copy and paste into a spreadsheet)
def dict(f_input="testo.txt"):
    """Parse a text file and write its vocabulary and word statistics.

    Reads ``f_input``, tokenizes it with ``splitter()`` and passes the
    token list to ``list_to_voc()``, which writes the ``_voc`` file and
    derives the ``_stat`` file name from it.

    NOTE: the name shadows the built-in ``dict``; it is kept because it is
    the documented entry point of this module.

    Args:
        f_input: path of the text file to parse.

    Returns:
        The ``(voc, stat)`` pair returned by ``list_to_voc()``.
    """
    import os  # local import: this module has no top-level import block

    # splitter() and list_to_voc() read this module-level global.
    global caratteri
    caratteri = "ABCDEFGHILMNOPQRSTUVZJKXYWabcdefghilmnopqrstuvzjkwyx_'ìèòàù&"  # characters that build "words"

    # Insert "_voc" right before the extension only.  The previous
    # f_input.replace(".", "_voc.") rewrote *every* dot, so "./libro.txt"
    # or "my.book.txt" produced an unusable path, and a name without any
    # dot made the output overwrite the input file.
    root, ext = os.path.splitext(f_input)
    if not ext:
        # list_to_voc() derives the _stat name by replacing "_voc." in the
        # path, so the output name must contain a dot after "_voc".
        ext = ".txt"
    f_output = root + "_voc" + ext

    # "with" closes the file even if read() raises.
    with open(f_input) as f:
        testo = f.read()

    lista = splitter(testo)  # main parsing procedure: text -> token list
    voc, stat = list_to_voc(lista, f_output)  # vocabulary + statistics
    return (voc, stat)
# main parsing engine; converts a text of any length into a list of alternating "words" (a "word" is a continuous sequence of
# "caratteri", the characters listed in the previous routine) and "not words" (any character or sequence of characters not
# listed there: blanks, punctuation, numbers....)
def splitter(testo, word_chars=None):
    """Split text into alternating runs of word and non-word characters.

    Consecutive characters that all belong to ``word_chars`` form one
    "word" token; consecutive characters that do not form one "not-word"
    token.  Joining the returned list with ``"".join(...)`` rebuilds
    ``testo`` exactly.

    Unlike the previous implementation, no empty-string token is ever
    produced: it used to append a trailing ``""`` whenever the text ended
    with a non-word character (and returned ``[""]`` for empty text), and
    since ``"" in caratteri`` is true that empty token was later counted
    as a word.

    Args:
        testo: the text to split.
        word_chars: characters that build words.  When omitted, the
            module-level global ``caratteri`` is used, as before.

    Returns:
        A list of non-empty strings; an empty list for empty text.
    """
    from itertools import groupby  # local import: no top-level import block

    if word_chars is None:
        word_chars = caratteri  # global set by dict() / ki()

    # A set gives O(1) membership tests; groupby() cuts the text every
    # time the "is a word character" flag flips.
    is_word_char = frozenset(word_chars).__contains__
    return ["".join(run) for _, run in groupby(testo, key=is_word_char)]
# builds the list of distinct words (vocabulary)
# 1. gets the "list-transform" of the text and builds a list of unique words;
# 2. writes [nomefile]_voc.txt and [nomefile]_stat.txt
def list_to_voc(lista, f_output, word_chars=None):
    """Build the vocabulary and word counts of a token list and save them.

    A token counts as a word when it is non-empty and its first character
    belongs to ``word_chars``.  Two files are written:

    * ``f_output``: the sorted distinct words, each followed by a blank;
    * the stat file: one ``word<TAB>count`` line per distinct word.  Its
      name is ``f_output`` with ``"_voc."`` replaced by ``"_stat."``.

    Args:
        lista: list of string tokens, e.g. the result of ``splitter()``.
        f_output: path of the vocabulary file to write.
        word_chars: characters that build words.  When omitted, the
            module-level global ``caratteri`` is used, as before.

    Returns:
        ``(voc, stat)``: the sorted list of distinct words and a dict
        mapping each word to its number of occurrences.
    """
    if word_chars is None:
        word_chars = caratteri  # global set by dict() / ki()
    word_chars = frozenset(word_chars)

    # Count in the dict directly: the old "not in voc" test scanned a list
    # for every token (quadratic).  Empty tokens are skipped explicitly,
    # because "" is a substring of any string and was counted as a word.
    stat = {}
    for token in lista:
        if token and token[0] in word_chars:
            stat[token] = stat.get(token, 0) + 1

    # alphabetical order
    voc = sorted(stat)

    # vocabulary file (_voc)
    with open(f_output, "w") as f:
        f.write("".join(word + " " for word in voc))

    # statistics file (_stat)
    f_stat = f_output.replace("_voc.", "_stat.")
    if f_stat == f_output:
        # No "_voc." in the name: without this guard the statistics would
        # silently overwrite the vocabulary file just written.
        f_stat = f_output + "_stat"
    with open(f_stat, "w") as f:
        f.write("".join(word + "\t" + str(stat[word]) + "\n" for word in voc))

    # Report the number of *words*; len(lista) also counted the not-word
    # tokens (blanks, punctuation) and roughly doubled the figure.
    n_words = sum(stat.values())
    print("Il testo contiene "+str(n_words)+" parole e il vocabolario contiene "+str(len(voc))+" parole.")
    return voc, stat
# two routines to find and replace strings (for no-regex coders ;-) as I am )
# finds and returns the first substring of a string that begins with the string idi and ends with the string idf
def find_stringa(stringa, idi, idf, dc=0):
    """Return the first substring of ``stringa`` delimited by ``idi`` and ``idf``.

    The opening delimiter is the first occurrence of ``idi``; the closing
    delimiter is the first occurrence of ``idf`` *after* the opening one.

    Args:
        stringa: the string to search.
        idi: opening delimiter.
        idf: closing delimiter.
        dc: ``0`` (default) returns only the text between the delimiters;
            any other value returns the delimiters as well.

    Returns:
        The matched substring, or ``""`` when either delimiter is missing.
    """
    start = stringa.find(idi)
    if start < 0:
        return ""

    # Search for the closing delimiter after the opening one; starting at
    # ``start`` made identical delimiters (e.g. '"' ... '"') match the
    # opening delimiter itself.
    inner_start = start + len(idi)
    inner_end = stringa.find(idf, inner_start)

    # Test the raw find() result: the old check added len(idf) first, so a
    # missing multi-character idf (-1 + len > 0) passed as "found".
    if inner_end < 0:
        return ""

    if dc == 0:
        return stringa[inner_start:inner_end]
    return stringa[start:inner_end + len(idf)]
# replaces with the string new the first substring of a string that begins with the string idi and ends with the string idf
def el_st1(stringa, idi, idf, new=""):
    """Replace the first ``idi`` ... ``idf`` span of ``stringa`` with ``new``.

    The span starts at the first occurrence of ``idi`` and ends with the
    first occurrence of ``idf`` after it; both delimiters are part of the
    replaced span.

    Args:
        stringa: the string to edit.
        idi: opening delimiter.
        idf: closing delimiter.
        new: replacement text (default: remove the span).

    Returns:
        ``(valore, vvalore)``: the edited string and the span that was
        replaced.  When either delimiter is missing the string is returned
        unchanged together with ``""`` (this case used to raise
        UnboundLocalError because the second value was never assigned).
    """
    start = stringa.find(idi)
    if start < 0:
        return stringa, ""

    # Look for the closing delimiter after the opening one, and test the
    # raw find() result so a missing multi-character idf is not mistaken
    # for a match.
    end = stringa.find(idf, start + len(idi))
    if end < 0:
        return stringa, ""
    end += len(idf)

    removed = stringa[start:end]
    # Splice by position instead of str.replace(): it edits exactly the
    # span that was located.
    return stringa[:start] + new + stringa[end:], removed
# a test routine to use previous outputs to verify and fix a different text; work in progress
def ki(filetesto="testo.txt",filevoc="vocabolario.txt",tag='<span style="color:red;">*</span>'):
    """Flag the words of a text that are missing from a vocabulary file.

    The text is tokenized with ``splitter()``; every word token that is
    not listed in ``filevoc`` is wrapped in ``tag``.  Two files are
    written next to the input: ``<name>_corr<ext>`` (plain text) and
    ``<name>_corr.htm`` (the same content inside ``<html><body>``).  The
    page marker ``[Image:00%.svg pagina]`` is rendered as a "PAGINA"
    separator in both outputs.

    NOTE(review): the text is written into the .htm file without HTML
    escaping, as in the original routine.

    Args:
        filetesto: path of the text to check.
        filevoc: path of the vocabulary file (whitespace-separated words,
            such as the ``_voc`` file written by ``list_to_voc()``).
        tag: markup template; every ``*`` in it is replaced by the
            suspect word.

    Returns:
        None.  A summary line is printed.
    """
    import os  # local import: this module has no top-level import block

    # splitter() reads this module-level global.
    global caratteri
    caratteri = "ABCDEFGHILMNOPQRSTUVZJKXYWabcdefghilmnopqrstuvzjkwyx_'ìèòàù&"

    # Derive output names from the extension only.  replace(".", "_corr.")
    # rewrote every dot in the path, and replace(".txt", ".htm") left the
    # name unchanged for non-.txt inputs, so both outputs shared one file.
    root, ext = os.path.splitext(filetesto)
    fileoutput = root + "_corr" + ext
    fhtm = root + "_corr.htm"
    if fhtm == fileoutput:  # input was itself a .htm file
        fhtm = fileoutput + ".htm"

    with open(filetesto) as f:
        testo = f.read()
    testo = testo.replace("[Image:00%.svg pagina]","####")

    # A set makes each vocabulary lookup O(1) instead of a list scan.
    with open(filevoc) as f:
        voc = set(f.read().split())

    lista = splitter(testo)
    nw = 0
    for i, token in enumerate(lista):
        # Skip empty tokens: "" is "in" any string, so an empty token
        # would be flagged as a suspect word.
        if token and token[0] in caratteri and token not in voc:
            nw = nw + 1
            lista[i] = tag.replace("*", token)

    with open(fileoutput, "w") as f_output:
        f_output.write("".join(t.replace("####","\n\nPAGINA\n\n") for t in lista))

    with open(fhtm, "w") as f_htm:
        f_htm.write("<html><body>")
        f_htm.write("".join(t.replace("####","<br><br>PAGINA<br><br>") for t in lista))
        f_htm.write("</body></html>")

    print("Sono state individuate "+str(nw)+" parole sospette; il testo è su "+fileoutput)
    return
--Alex brollo 14:08, 28 October 2008 (UTC)
BlogMarks
del.icio.us
digg
Fark
Furl
Newsvine
reddit
Segnalo
Simpy
Slashdot
smarking
Spurl
Wists
