Progetto:Bot/Programmi in Python per i bot/fixRitaglio.py

Da Wikisource.
#!/usr/bin/python
# -*- coding: utf-8  -*-
# versione 1 28/07/2016 

from PIL import Image
import os,shutil,sys,urlparse, urllib
import zipfile
from internetarchive import download
import pywikibot as bot
import pywikibot.pagegenerators as pagegenerators
import re


"""
Passi logici

1. scaricare djvu da Commons OK: downloadDjvu()
2. cercare le pagine dove viene usato Ritaglio e salvarne il codice in pagine/ OK: ma da rivedere: downloadPagine()
3. estrarre il tiff delle pagine (il jp2 se c'è un item IA) e salvarlo in tiff/ (o in jp2)
4. basandosi sul codice pagine eseguire il ritaglio fisico come .jpg e salvarlo in jpg

Poi:
5. manualmente verificare le jpg, eventualmente ritoccarle
6. caricarle con commonist

Usa:
def ReferringPageGenerator(referredPage, followRedirects=False,
                           withTemplateInclusion=True,
                           onlyTemplateInclusion=False,
                           step=None, total=None, content=False):

"""
# Site handles shared by the functions below; both are created at import time.
commons=bot.Site("commons","commons")
it=bot.Site("it","wikisource")


class MyOpener(urllib.FancyURLopener):
    """FancyURLopener subclass that overrides the default ``version`` string."""

    # NOTE(review): per the urllib documentation ``version`` is used as the
    # value of the User-Agent header, so the literal "User-Agent: " prefix
    # presumably ends up inside the header value - confirm this is intended.
    version = 'User-Agent: Alex (+http://it.wikisource.org/wiki/Utente:Alex_brollo)'


# Single opener instance used for raw file downloads (see downloadDjvu).
opener=MyOpener()

def generatorPrefix(prefisso,site=it):
    """Return a generator over the non-redirect pages whose title starts with *prefisso*.

    prefisso -- title prefix, passed unchanged to PrefixingPageGenerator.
    site     -- site to query; defaults to the module-level ``it`` handle.

    The ``site`` argument used to be accepted but silently ignored, so the
    generator always ran on pywikibot's default site; it is now forwarded.
    """
    return pagegenerators.PrefixingPageGenerator(prefisso,includeredirects=False,site=site)

def cleanfolder(dirpath):
    """Make sure *dirpath* exists and is empty.

    The directory is created (one level only, via os.mkdir) when missing;
    otherwise every entry inside it is deleted. Returns None.
    """
    if not os.path.isdir(dirpath):
        os.mkdir(dirpath)
    entries = [os.path.join(dirpath, name) for name in os.listdir(dirpath)]
    for entry in entries:
        try:
            # Works for sub-directories; anything else raises OSError ...
            shutil.rmtree(entry)
        except OSError:
            # ... and is then removed as a plain file.
            os.remove(entry)

def downloadDjvu(base):
    """Download the file "File:<base>" from Commons into input/Djvu.djvu.

    base -- file name without the "File:" prefix.

    The "input" folder is emptied first. Returns the number of bytes
    downloaded.
    """
    cleanfolder("input")
    nomeDjvu="File:"+base
    sito=bot.Site("commons","commons")
    paginaFilepage=bot.ImagePage(bot.Page(sito, nomeDjvu))
    paginaFileUrl=paginaFilepage.fileUrl()
    # Close the HTTP response and the output file explicitly instead of
    # relying on garbage collection.
    risposta=opener.open(paginaFileUrl+"?action=render")
    try:
        immagine=risposta.read()
    finally:
        risposta.close()
    with open("input/Djvu.djvu","wb") as f:
        f.write(immagine)
    return len(immagine)
        
def downloadPagine(base):
    cleanfolder("pagine")
    g=pagegenerators.ReferringPageGenerator(bot.Page(it,"Template:Ritaglio"),onlyTemplateInclusion=True)
    for pagina in g:
        if "Pagina:"+base in pagina.title():
            numeroPagina=pagina.title()[pagina.title().rfind("/")+1:].zfill(4)
            open("pagine/"+numeroPagina+".txt","w").write(pagina.get().encode("utf-8"))
            print "XXX ",pagina.title()
    return

def extractTiff():
    cleanfolder("tiff")
    l=os.listdir("pagine")
    for pagina in l:
        np=pagina.replace(".txt","")
        while np.startswith("0"):
            np=np[1:]
        djvu='input/Djvu.djvu'
        imm='tiff/'+pagina.replace(".txt",'.tiff')
        comando='ddjvu -page=%s -format=tiff %s %s' % (np,djvu,imm)
        r=os.system(comando)
        print r,comando
    return

def renameJp2(delta):
    """Rename jp2/<anything>_<number>.jp2 to jp2/<number+delta, 4 digits>.jp2.

    delta -- integer added to the number found in each file name.

    Files whose name does not contain "_<digits>.jp2" (already renamed
    images, stray files) are left alone instead of aborting the whole run
    with an IndexError.
    """
    r=re.compile(r"_([0-9]+)\.jp2")
    for f in os.listdir("jp2"):
        trovato=r.search(f)
        if trovato is None:
            continue
        numeroNew=str(int(trovato.group(1))+delta).zfill(4)
        os.rename("jp2/"+f,"jp2/"+numeroNew+".jp2")
    return
        
        
def vai(base,IA,delta):
    """Run the whole pipeline for the index/file *base*.

    base  -- file name on Commons (also used to select the Pagina: pages).
    IA    -- Internet Archive item identifier, or None to work from the djvu.
    delta -- offset added to the jp2 page numbers (IA branch only); None
             means 0. A string such as the one coming from sys.argv is
             accepted and converted with int().

    With IA None: download the djvu, save the pages, extract the TIFFs and
    crop them. Otherwise: download and unzip the item's *_jp2.zip, rename the
    jp2 files to <page number>.jp2, save the pages and crop the jp2 images.
    """
    if IA is None: # work from the djvu
        downloadDjvu(base)
        downloadPagine(base)
        extractTiff()
        cropper(base,"tiff",None)
    else:
        delta=0 if delta is None else int(delta)
        download(IA,glob_pattern="*_jp2.zip",destdir="input", verbose=True,no_directory=True)
        # The parameter is called IA; the former reference to the undefined
        # name IAid raised NameError here.
        dezip(IA+"_jp2.zip")
        # cropper() opens jp2/<NNNN>.jp2, so the extracted <id>_<NNNN>.jp2
        # files have to be renamed (and shifted by delta) first.
        renameJp2(delta)
        downloadPagine(base)
        cropper(base,"jp2",delta)

    return

def datiRitaglio(pagina):
    """Parse the {{Ritaglio}} templates contained in a saved page file.

    pagina -- path of a UTF-8 text file.

    Returns a list with one dict per template, in order of appearance. Each
    dict may contain:
      "file"  -- value of the file= parameter, stripped
      "page"  -- value of the page= parameter, stripped
      "coord" -- [width, [left, top, left+wWidth, top+wHeight]] built from
                 the width, wLeft, wTop, wWidth and wHeight parameters
                 (integers followed by "px")

    "coord" is present only when all five geometry parameters were found in
    that template. Values are collected per template, so an incomplete
    template no longer inherits numbers from the previous one (nor raises
    UnboundLocalError when it is the first).
    """
    with open(pagina) as f:
        t=unicode(f.read(),"utf-8")
    t=t.replace("{{ritaglio","{{Ritaglio").replace("\n","")
    # Patterns are compiled once instead of once per template field.
    testuali=[
        ("file",re.compile(r"file *= *(.+)")),
        ("page",re.compile(r"page *= *(.+)")),
    ]
    numerici=[
        ("wt",re.compile(r"width *= *(\d+)px")),
        ("h",re.compile(r"wHeight *= *(\d+)px")),
        ("w",re.compile(r"wWidth *= *(\d+)px")),
        ("x1",re.compile(r"wLeft *= *(\d+)px")),
        ("y1",re.compile(r"wTop *= *(\d+)px")),
    ]
    dati=[]
    for ritaglio in produci_lista(t,"{{Ritaglio","}}",1):
        voce={}
        misure={}
        for campo in ritaglio.split("|"):
            for chiave,r in testuali:
                trovati=r.findall(campo)
                if trovati:
                    voce[chiave]=trovati[0].strip()
            for chiave,r in numerici:
                trovati=r.findall(campo)
                if trovati:
                    misure[chiave]=int(trovati[0])
        if len(misure)==len(numerici):
            x1,y1=misure["x1"],misure["y1"]
            voce["coord"]=[misure["wt"],[x1,y1,x1+misure["w"],y1+misure["h"]]]
        dati.append(voce)
    return dati

def cropper(base,tipo="tiff",delta=0): # pagina è il nome derivante da listdir, tipo 0005.txt
    cleanfolder("jpg")
    for pagina in os.listdir("pagine"):
        disambigua="abcdefghilmno"
        dati=datiRitaglio(u"pagine/"+pagina)
        if len(dati)==0:
            continue
        else:
            for i in range(len(dati)):
                try:
                    if tipo=="tiff":
                        dis=disambigua[i:i+1]
                        tiff=Image.open("tiff/"+pagina.replace(".txt",".tiff"))
                        fc=tiff.size[0]*1.0/dati[i]["coord"][0]
                        co=dati[i]["coord"][1]
                        box=(int(co[0]*fc),int(co[1]*fc),int(co[2]*fc),int(co[3]*fc))
                        jpg=tiff.crop(box)
                        if len(dati)==1:
                            jpg.save(u"jpg/"+base.replace(".djvu","")+"-"+pagina.replace(".txt","")+".jpg")
                        else:
                            jpg.save(u"jpg/"+base.replace(".djvu","")+"-"+pagina.replace(".txt","")+dis+".jpg")
                    elif tipo=="jp2":
                        dis=disambigua[i:i+1]
                        jp2=Image.open("jp2/"+pagina.replace(".txt",".jp2"))
                        fc=jp2.size[0]*1.0/dati[i]["coord"][0]
                        co=dati[i]["coord"][1]
                        box=(int(co[0]*fc),int(co[1]*fc),int(co[2]*fc),int(co[3]*fc))
                        jpg=jp2.crop(box)
                        if len(dati)==1:
                            jpg.save(u"jpg/"+base.replace(".djvu","")+"-"+pagina.replace(".txt","")+".jpg")
                        else:
                            jpg.save(u"jpg/"+base.replace(".djvu","")+"-"+pagina.replace(".txt","")+dis+".jpg")
                    else:
                        print "Errore tipo file in pagina ",pagina
                except:
                    print "Errore in pagina ",pagina
    
      

    return
        
        
    

# utilities 
# Nuova versione, gestisce i tag annidati; x e' la parte "aspecifica" del
# tag di apertura (es: {{ cercando {{Intestazione| )
def find_stringa(stringa,idi,idf,dc=0,x=None,side="left"):
    """Return the text found between the delimiters *idi* and *idf* in *stringa*.

    stringa -- text to search
    idi     -- opening delimiter
    idf     -- closing delimiter
    dc      -- 0: return only the text between the delimiters;
               anything else: return the delimiters as well
    x       -- generic opening marker used to handle nested tags (for
               example "{{" when idi is "{{Intestazione|"); when given, the
               match is extended until the slice holds at least as many
               idf as x
    side    -- "right": start from the last occurrence of idi;
               anything else: start from the first one

    Returns "" when idi is missing, when idf does not follow it, or when
    the nesting never balances. (A missing idf longer than one character
    used to go unnoticed and a wrong slice was returned.)
    """
    if side=="right":
        idip=stringa.rfind(idi)
    else:
        idip=stringa.find(idi)
    if idip<0:
        return ""
    fine=stringa.find(idf,idip+len(idi))
    if fine<0:
        return ""
    idfp=fine+len(idf)
    if x is not None:
        # Nested tags: keep swallowing closing delimiters until balanced.
        while stringa[idip:idfp].count(x)>stringa[idip:idfp].count(idf):
            fine=stringa.find(idf,idfp)
            if fine<0:
                return ""
            idfp=fine+len(idf)
    if dc==0:
        return stringa[idip+len(idi):idfp-len(idf)]
    return stringa[idip:idfp]

def produci_lista(testo,idi,idf,dc=1,inizio=None):
    """Collect every idi...idf block of *testo*, in order of appearance.

    testo  -- text to scan (not modified)
    idi    -- opening delimiter
    idf    -- closing delimiter
    dc     -- 0: list items exclude the delimiters; otherwise they are kept
    inizio -- passed to find_stringa as its nesting marker

    Blocks are located with find_stringa and removed from a working copy one
    at a time until none is found. Returns the list of blocks.
    """
    resto=testo[:]
    trovati=[]
    while True:
        blocco=find_stringa(resto,idi,idf,1,inizio)
        if blocco=="":
            break
        # Drop the block just found so that the next search moves on.
        resto=resto.replace(blocco,"",1)
        if dc==0:
            blocco=find_stringa(blocco,idi,idf,0,inizio)
        trovati.append(blocco)
    return trovati

def carica_pcl(nome_file, folder="dati/"):
    """Load and return the object pickled in <folder><nome_file>.pcl.

    nome_file -- file name without the ".pcl" extension
    folder    -- folder prefix, concatenated as is (keep the trailing "/")

    SECURITY: unpickling can execute arbitrary code; only load files
    written by salva_pcl.
    """
    # pickle is not imported at module level, so the call used to fail
    # with NameError.
    import pickle
    nome_file=folder+nome_file+".pcl"
    with open(nome_file) as f:
        contenuto=pickle.load(f)
    return contenuto

def salva_pcl(variabile,nome_file="dato",folder="dati/"):
    nome_file=folder+nome_file+".pcl"
    f=open(nome_file,"w")
    pickle.dump(variabile, f)
    f.close()
    print "Variabile salvata nel file "+nome_file
    return 

## routine ereditate da djvuCl.py

def path2url(path):
    """Return *path* converted with urllib.pathname2url and joined onto 'file:'."""
    percorsoUrl=urllib.pathname2url(path)
    return urlparse.urljoin('file:', percorsoUrl)

def downloadItem(IAid):
        """Empty "input" and download the item's *_djvu.xml and *_jp2.zip files into it.

        IAid -- item identifier passed to internetarchive.download.
        """
        cleanfolder("input")
        for modello in ("*_djvu.xml","*_jp2.zip"):
                download(IAid,glob_pattern=modello,destdir="input", verbose=True,no_directory=True)
        return

def jp2tojpg():
        cleanfolder("jpg")
        listaJp2=os.listdir("jp2")
        listaJp2.sort()
        for f in range(len(listaJp2)):
                if listaJp2[f].endswith(".jp2"):
                        fout=listaJp2[f][0:-4]+".jpg"
                        image=Image.open(os.path.join("jp2",listaJp2[f]))
                        if f==0 and image.size[0]<1000:
                                fattore=1024.0/image.size[0]
                                image=image.resize((int(image.size[0]*fattore),int(image.size[1]*fattore)))
                        image.save(os.path.join("jpg",fout))
##                comando="magick convert jp2\%s jpg\%s" % (f,f[0:-4]+".jpg")
##                res=os.system(comando)
                        print fout, " salvata"
        return

                

def jpgtodjvu():
        cleanfolder("djvu")
        listaJpg=os.listdir("jpg")
        for f in listaJpg:
                if f.endswith(".jpg"):
                        comando="c44 jpg\%s djvu\%s" % (f,f[0:-4]+".djvu")
                        res=os.system(comando)
                        print res,comando
        return
                
def merge(pathdjvu="djvu"):
        listaDjvu=os.listdir(pathdjvu)
        listaDjvu.sort()
        lista=""
        for n in range(len(listaDjvu)):
                if listaDjvu[n].endswith(".djvu"):
                        lista+=os.path.join("djvu",listaDjvu[n])+" "
                if len(lista)>7500:
                        break
        
        djvuBundled=os.path.join("output",listaDjvu[0].replace("_0000.djvu",".djvu"))
        comando="djvm -c %s %s" % (djvuBundled,lista)
        res=os.system(comando)
        print res,comando
        if n<len(listaDjvu):
                np=n+1
                for n in range(np,len(listaDjvu)):
                        comando="djvm -i %s %s" % (djvuBundled,os.path.join("djvu",listaDjvu[n]))
                        res=os.system(comando)
                        print res,comando
        return lista



def editXml(IAid):
        xmlFile=os.path.join("input",IAid)+"_djvu.xml"
        xml=open(xmlFile).read()
        url=find_stringa(xml,'OBJECT data="','"',0)
        urlNew=path2url(os.getcwd())+"/output/"+IAid+".djvu"
        xml=xml.replace(url,urlNew)
        open(xmlFile,"w").write(xml)
        print "File "+IAid+"_djvu.xml modificato"
        return

def caricaTesto(IAid):
        editXml(IAid)
        comando="djvuxmlparser %s" % (os.path.join("input",IAid+"_djvu.xml"))
        print comando
        res=os.system(comando)
        print "risultato: ",res
        return


def dezip(zipf):
        cleanfolder("jp2")
        z=zipfile.ZipFile(os.path.join("input",zipf))
        for f in z.namelist():
                jp2=f.split("/").pop()
                if jp2.endswith(".jp2"):
                        data=z.read(f)
                        open(os.path.join("jp2",jp2),"wb").write(data)
                print jp2," saved"
        return


          
if __name__ == "__main__": # receives the base File/Index name
    # Command line: BASE [IA_ID [DELTA]]
    argomenti=sys.argv[1:]
    if 1<=len(argomenti)<=3:
        # Missing optional arguments are passed to vai() as None.
        base,IA,delta=argomenti+[None]*(3-len(argomenti))
        vai(base,IA,delta)
    else:
        # A wrong number of arguments used to be ignored silently.
        sys.stderr.write("Uso: fixRitaglio.py BASE [ID_IA [DELTA]]\n")
        sys.exit(2)
          
                                          

'{{}}'