Progetto:Bot/Programmi in Python per i bot/abbyyXml.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Rough documentation notes.
This script allows exploring Internet Archive _abbyy.gz files.
Some variables and functions are specific to a test book (IA item bub_gb_lvzoCyRdzsoC)
to be imported into [[D'Ayala - Dizionario militare francese italiano.djvu]].
The base functions (leggi(), pagina(), testo()) should run for any _abbyy.gz file.
To test the script, download an [id]_abbyy.gz file into the same folder where abbyyXml.py lies,
then run the script in the IDLE environment and call leggi("[id]"): the xml file will be unzipped,
saved as [id]_abbyy.xml, and a table of offsets and lengths of its <page> elements will be saved by pickle into [id].pcl.
Both the unzipping and the building of the offset table are very fast (a few seconds for a very large xml, over 300 MB, on a cheap Windows 10 pc).
Later runs of leggi("[id]") will simply load the offsets and lengths of the <page> elements from [id].pcl.
Then run:
page=pagina(n)
and you'll get a soup page object that can be explored; its element tree is roughly:
page
  block
    text
      par
        line
          formatting
            charParams
So far only blocks of type text have been explored; there are other types, picture and table being particularly interesting.
To get rough unformatted text (similar to the bare OCR extracted into new pages by the mediawiki software) run:
testo(page)
where page is the soup page object returned by pagina().
Another function,
testof(page),
extracts the text together with some formatting, but keep in mind that its code is strictly specific to IA item bub_gb_lvzoCyRdzsoC (base name of the nsPage, header...).
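
A typical interactive session might look like this (the item id is the test book above;
page number 10 is just an arbitrary example):
    leggi("bub_gb_lvzoCyRdzsoC")   # first run: unzips the xml and builds the offset table
    page = pagina(10)              # soup object of djvu page 10 (the n-4 offset applies to this item only)
    print testo(page)              # rough unformatted text of that page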
Good luck!
'''
from bs4 import BeautifulSoup
import lxml, gzip,pickle
from os import system,listdir,getcwd,path
import pywikibot as bot
import pywikibot.pagegenerators as pagegenerators
import pywikibot.proofreadpage as proofreadpage
mul=bot.Site("-","wikisource")
it=bot.Site("it","wikisource")
IAid=""
p=[]
header=u'<noinclude><pagequality level="1" user="BrolloBot" />'+\
u'{{RigaIntestazione|{{gap|6em}}|\u2014 \u2014|{{gap|6em}}}}</noinclude>'
footer=u'<noinclude><references/></noinclude>'
# leggi() initializes two global variables: IAid (the Internet Archive ID)
# and p (the list of tuples of page xml code offsets and lengths).
# If only the [IAid]_abbyy.gz file exists, it extracts and saves [IAid]_abbyy.xml,
# then builds p and saves it into [IAid].pcl;
# if [IAid]_abbyy.xml exists but [IAid].pcl doesn't, it builds p and
# saves it into [IAid].pcl;
# if [IAid].pcl exists, it loads its contents into p.
def leggi(iaid):
global IAid
IAid=iaid
global p
if path.isfile(iaid+".pcl"):
p=carica_pcl(iaid,"")
elif path.isfile(iaid+"_abbyy.xml"):
xml=open(iaid+"_abbyy.xml","rb").read()
p=coord(xml,"<page","</page>")
salva_pcl(p,iaid,"")
elif path.isfile(iaid+"_abbyy.gz"):
with gzip.open(iaid+"_abbyy.gz", 'rb') as f:
xml=f.read()
open(iaid+"_abbyy.xml","wb").write(xml)
p=coord(xml,"<page","</page>")
salva_pcl(p,iaid,"")
else:
print "nessun file utilizzabile"
# returns a soup object of page n of the xml file, using the p list of tuples; the offset on the page number (n=n-4)
# is set for the special case of IA test item bub_gb_lvzoCyRdzsoC; usually it should be n=n-1
def pagina(n):
n=n-4
f=open(IAid+"_abbyy.xml","rb")
f.seek(p[n][0])
testo=f.read(p[n][1])
bsPagina=BeautifulSoup(unicode(testo,"utf-8"),"xml")
f.close()
return bsPagina
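# The docstring notes that only blocks of type text have been explored so far.
# The helper below is only an illustrative sketch, not part of the original
# workflow (the name tipi_blocco is made up here; blockType is the usual abbyy
# attribute holding the block type): it lists the blockType of every <block>
# of a page soup object, so picture and table blocks can be spotted quickly.
def tipi_blocco(soupObj):
    tipi=[]
    for b in soupObj.find_all("block"):
        tipi.append(b.get("blockType"))
    return tipi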
# finds the offset and length of every <page...>...</page> element inside the abbyy xml content
def coord(f,s1,s2):
l=[]
d1=0
while True:
d1=f.find(s1,d1)
if d1==-1:
break
d2=f.find(s2,d1)
l.append((d1,d2+len(s2)-d1))
d1=d2
return l
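# Toy illustration of coord(): on the string "<page A></page><page B></page>"
# coord(s, "<page", "</page>") returns [(0, 15), (15, 15)],
# i.e. one (offset, length) tuple per <page>...</page> element.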
# this extracts unformatted text
def testo(soupObj):
testo=""
paragrafi=soupObj.find_all("par")
for p in paragrafi:
parText=""
linee=p.find_all("line")
for l in linee:
linea=""
caratteri=l.find_all("charParams")
for c in caratteri:
linea+=c.get_text()
linea=linea.strip()
parText+="\n"+linea
parText=parText.strip()
parText="\n\n"+parText
testo+=parText
testo=testo.strip()
return testo
# this extracts formatted text (parameters set for test book IA ID bub_gb_lvzoCyRdzsoC)
def testof(soupObj):
testo=""
paragrafi=soupObj.find_all("par")
for p in paragrafi:
parText=""
linee=p.find_all("line")
for l in linee:
linea=""
form=l.find_all("formatting")
for f in form:
ft=""
caratteri=f.find_all("charParams")
for c in caratteri:
ft+=c.get_text()
if "italic" in f.attrs and f.attrs["italic"]=="true":
ft="<i>"+ft+"</i>"
if "bold" in f.attrs and f.attrs["bold"]=="true":
ft="<b>"+ft+"</b>"
if "smallcaps" in f.attrs and f.attrs["smallcaps"]=="true":
ft="{{Sc|"+ft+"}}"
linea+=ft
linea=linea.strip()
parText+="\n"+linea
parText=parText.strip()
parText="\n\n"+parText
testo+=parText
testo=header+testo.strip()+footer
    # removal of redundant formatting tags across line breaks
testo=testo.replace("</i>\n<i>","\n").replace("</b>\n<b>","\n").replace("-}}\n","}}-\n")
return testo
# uploads the formatted text of page n to the corresponding nsPage of the test book
def put(n):
testo=testof(pagina(n))
bot.Page(it,"Pagina:D'Ayala - Dizionario militare francese italiano.djvu/"+str(n)).put(testo,"Test abbyyXml.py")
return
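# Illustrative sketch only (put_range and its parameters are made-up names, not
# part of the original workflow): a whole range of pages of the test book could
# be uploaded by looping over put().
def put_range(primo, ultimo):
    for n in range(primo, ultimo+1):
        put(n)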
### pickle utilities
def carica_pcl(nome_file, folder="dati/"):
nome_file=folder+nome_file+".pcl"
f=open(nome_file)
contenuto=pickle.load(f)
f.close()
return contenuto
def salva_pcl(variabile,nome_file="dato",folder="dati/"):
nome_file=folder+nome_file+".pcl"
f=open(nome_file,"w")
pickle.dump(variabile, f)
f.close()
print "Variabile salvata nel file "+nome_file
return