Progetto:Bot/Programmi in Python per i bot/abbyyXml.py

Da Wikisource.
Jump to navigation Jump to search
#!/usr/bin/python
# -*- coding: utf-8  -*-

'''
Doc rough notes
These scripts allow you to explore Internet Archive _abbyy.gz files. 

Some variables and functions are specific for a test book (IA item bub_gb_lvzoCyRdzsoC) 
to be imported into [[D'Ayala - Dizionario militare francese italiano.djvu]].

Base functions (leggi(), pagina(), testo()) run (I hope...) for any _abbyy.gz file.

To test the scripts, download a [id]_abbyy.gz file into the same folder where abbyyXml.py lies, 
then run the script into Idle environment and run leggi("[id]"), xml file will be unzipped and 
saved as [id]_abbyy.xml, and a table of <page> element offsets and lengths within it will be saved by pickle into [id].pcl.
Both unzipping and offset-table building are very fast (a few seconds for a very large xml > 300 MB using a cheap Windows 10 pc)

Next runs of leggi("[id]") will simply upload the offset-length of <page> elements.

Then run:
page=pagina(n)

and you'll get a soup.page object that can be explored; its tree is more or less:
page
   block
      text
         par
            line
                charparams
                formatting
So far, only blocks of type text have been explored, there are other types, picture and table being very interesting.

To get rough unformatted text (similar to bare OCR extracted into new pages by mediawiki software) run:
testo(page)
where page is soup.page object got by pagina(). 

Another function:
testof(page) 
gets text and some formatting, but consider that code is strictly specific for IA item bub_gb_lvzoCyRdzsoC (base name of nsPage, header...)

Good luck!
'''

from bs4 import BeautifulSoup
import lxml, gzip,pickle
from os import system,listdir,getcwd,path


import pywikibot as bot
import pywikibot.pagegenerators as pagegenerators
import pywikibot.proofreadpage as proofreadpage

# Wikisource site handles, created at import time — importing this module
# therefore requires a configured pywikibot environment.
mul=bot.Site("-","wikisource")
it=bot.Site("it","wikisource")

'{{}}'  # NOTE(review): stray string expression — a no-op, presumably a leftover editing aid

# Internet Archive item id of the currently loaded book; set by leggi().
IAid=""
# List of (offset, length) tuples locating each <page> element inside the
# [IAid]_abbyy.xml file; built/loaded by leggi(), consumed by pagina().
p=[]
# Wikitext header/footer wrapped around each page by testof(); the header
# layout is specific to the D'Ayala test book (IA item bub_gb_lvzoCyRdzsoC).
header=u'<noinclude><pagequality level="1" user="BrolloBot" />'+\
        u'{{RigaIntestazione|{{gap|6em}}|\u2014  \u2014|{{gap|6em}}}}</noinclude>'
footer=u'<noinclude><references/></noinclude>'
# initializes two global variables IAid (Internet Archive ID)
# and p (list of tuples of page xml code offsets

# if only [IAid]_abbyy.gz file exists it extracts and saves [IAid]_abbyy.xml
# and builds p saving it into [IAid].pcl

# if [IAid]_abbyy.xml exists but [IAid].pcl doesn't it builds p and
# saves it into [IAid].pcl

# if [IAid].pcl exists it loads contents into p
def leggi(iaid):
    """Initialize the globals IAid and p for Internet Archive item *iaid*.

    Looks for, in order of preference:
      1. [iaid].pcl        - cached offset table: load it with pickle;
      2. [iaid]_abbyy.xml  - scan it for <page> offsets and cache them;
      3. [iaid]_abbyy.gz   - unzip it, save the XML, then index and cache.
    Prints an error message if none of the three files exists.

    Fix: all file handles are now closed deterministically via `with`
    (the original leaked the xml read/write handles).
    """
    global IAid, p
    IAid = iaid
    if path.isfile(iaid + ".pcl"):
        # Fast path: the offset table was already built on a previous run.
        p = carica_pcl(iaid, "")
    elif path.isfile(iaid + "_abbyy.xml"):
        with open(iaid + "_abbyy.xml", "rb") as f:
            xml = f.read()
        p = coord(xml, "<page", "</page>")
        salva_pcl(p, iaid, "")
    elif path.isfile(iaid + "_abbyy.gz"):
        # Unzip once, persist the XML, then build and cache the offset table.
        with gzip.open(iaid + "_abbyy.gz", "rb") as f:
            xml = f.read()
        with open(iaid + "_abbyy.xml", "wb") as out:
            out.write(xml)
        p = coord(xml, "<page", "</page>")
        salva_pcl(p, iaid, "")
    else:
        print("nessun file utilizzabile")

# returns soup object of page n into xml with p list of tuples; offset of page number (n=n-4)
# is set for the special case of IA test item bub_gb_lvzoCyRdzsoC, usually it should be n=n-1
def pagina(n, offset=4):
    """Return a BeautifulSoup object for page *n* of the current item.

    Reads only the byte span of the n-th <page> element (offset/length taken
    from the global list p built by leggi()), so even very large XML files
    are never loaded whole.

    offset: difference between the book page number and the <page> index in
    the XML. The default 4 matches the test item bub_gb_lvzoCyRdzsoC; for
    most items it should be 1.

    Fixes: the file handle is closed even if parsing raises (`with`), and
    bytes.decode() replaces the Python-2-only unicode() builtin.
    """
    i = n - offset
    with open(IAid + "_abbyy.xml", "rb") as f:
        f.seek(p[i][0])
        raw = f.read(p[i][1])
    return BeautifulSoup(raw.decode("utf-8"), "xml")

# this finds offset and length of elements <page...>...</page> into abbyy xml file
def coord(f, s1, s2):
    """Return a list of (offset, length) tuples for every s1...s2 element in f.

    f  -- the xml file contents (bytes under Python 2 usage; any str/bytes
          sequence supporting .find works)
    s1 -- opening marker, e.g. "<page"
    s2 -- closing marker, e.g. "</page>"

    The length covers the span from the start of s1 through the end of s2.
    Fix: an opening marker with no matching closing marker is now ignored;
    the original appended a bogus negative-length tuple and restarted the
    search from index -1.
    """
    spans = []
    start = 0
    while True:
        start = f.find(s1, start)
        if start == -1:
            break
        end = f.find(s2, start)
        if end == -1:
            # Malformed input: unmatched opening marker — stop scanning.
            break
        spans.append((start, end + len(s2) - start))
        start = end
    return spans

# this extracts unformatted text 
def testo(soupObj):
    testo=""
    paragrafi=soupObj.find_all("par")
    for p in paragrafi:
        parText=""
        
        linee=p.find_all("line")
        for l in linee:
            linea=""
        
            caratteri=l.find_all("charParams")
            for c in caratteri:
                linea+=c.get_text()
            linea=linea.strip() 
            
            parText+="\n"+linea
        parText=parText.strip()
        
        parText="\n\n"+parText
        testo+=parText
    testo=testo.strip()
    return testo

# this extracts formatted text (parameters set for test book IA ID bub_gb_lvzoCyRdzsoC)
def testof(soupObj):
    """Return the text of a soup <page> object with basic wiki formatting.

    Like testo(), but walks the <formatting> children of each line and wraps
    each run in <i>...</i>, <b>...</b> and/or {{Sc|...}} according to its
    italic/bold/smallcaps attributes, then adds the global header/footer.
    Parameters are specific to IA test item bub_gb_lvzoCyRdzsoC.
    """
    paragraphs = []
    for par in soupObj.find_all("par"):
        lines = []
        for line in par.find_all("line"):
            pieces = []
            for fmt in line.find_all("formatting"):
                chunk = "".join(ch.get_text() for ch in fmt.find_all("charParams"))
                if fmt.attrs.get("italic") == "true":
                    chunk = "<i>" + chunk + "</i>"
                if fmt.attrs.get("bold") == "true":
                    chunk = "<b>" + chunk + "</b>"
                if fmt.attrs.get("smallcaps") == "true":
                    chunk = "{{Sc|" + chunk + "}}"
                pieces.append(chunk)
            lines.append("".join(pieces).strip())
        paragraphs.append("\n".join(lines).strip())
    testo = header + "\n\n".join(paragraphs).strip() + footer
    # eliminazione tag formattazione ridondanti
    testo = testo.replace("</i>\n<i>", "\n").replace("</b>\n<b>", "\n").replace("-}}\n", "}}-\n")
    return testo

# this uploads the text into nsPage of test book
def put(n):
    """Upload the formatted text of page *n* to the test book's Page: namespace on itwikisource."""
    titolo = "Pagina:D'Ayala - Dizionario militare francese italiano.djvu/" + str(n)
    contenuto = testof(pagina(n))
    bot.Page(it, titolo).put(contenuto, "Test abbyyXml.py")

        
### pickle utilities 

def carica_pcl(nome_file, folder="dati/"):
    """Load and return the object pickled in folder + nome_file + ".pcl".

    Fixes: the file is opened in binary mode ("rb") — text mode corrupts
    pickle data on Windows under Python 2 and fails outright under
    Python 3 — and the handle is closed even on error via `with`.
    """
    nome_file = folder + nome_file + ".pcl"
    with open(nome_file, "rb") as f:
        return pickle.load(f)

def salva_pcl(variabile, nome_file="dato", folder="dati/"):
    """Pickle *variabile* into folder + nome_file + ".pcl" and print a confirmation.

    Fixes: the file is opened in binary mode ("wb") — pickle output is
    binary, and text mode corrupts it on Windows under Python 2 and fails
    under Python 3 — and the handle is closed even on error via `with`.
    """
    nome_file = folder + nome_file + ".pcl"
    with open(nome_file, "wb") as f:
        pickle.dump(variabile, f)
    print("Variabile salvata nel file " + nome_file)