#!/usr/bin/python
# -*- coding: utf-8 -*-
# versione 1 28/07/2016
import os
import pickle
import re
import shutil
import sys
import urllib
import urlparse
import zipfile

from PIL import Image
from internetarchive import download
import pywikibot as bot
import pywikibot.pagegenerators as pagegenerators
"""
Passi logici
1. scaricare djvu da Commons OK: downloadDjvu()
2. cercare le pagine dove viene usato Ritaglio e salvarne il codice in pagine/ OK: ma da rivedere: downloadPagine()
3. estrarre il tiff delle pagine (il jp2 se c'è un item IA) e salvarlo in tiff/ (o in jp2)
4. basandosi sul codice pagine eseguire il ritaglio fisico come .jpg e salvarlo in jpg
Poi:
5. manualmente verificare le jpg, eventualmente ritoccarle
6. caricarle con commonist
Usa:
def ReferringPageGenerator(referredPage, followRedirects=False,
withTemplateInclusion=True,
onlyTemplateInclusion=False,
step=None, total=None, content=False):
"""
# Site handles: Wikimedia Commons and Italian Wikisource.
commons = bot.Site("commons", "commons")
it = bot.Site("it", "wikisource")
class MyOpener(urllib.FancyURLopener):
    """URL opener that identifies this script with a custom user agent.

    ``FancyURLopener`` sends ``version`` as the *value* of the User-Agent
    header, so the string must not contain the header name itself
    (otherwise the server receives ``User-Agent: User-Agent: ...``).
    """

    version = 'Alex (+http://it.wikisource.org/wiki/Utente:Alex_brollo)'


# Shared opener instance used for the HTTP downloads in this script.
opener = MyOpener()
def generatorPrefix(prefisso, site=it):
    """Return a generator over the pages whose title starts with a prefix.

    Redirect pages are excluded.

    Args:
        prefisso: Title prefix to look for.
        site: pywikibot site to query; defaults to the module-level ``it``
            site. (Previously this argument was accepted but ignored.)

    Returns:
        The generator produced by ``PrefixingPageGenerator``.
    """
    return pagegenerators.PrefixingPageGenerator(
        prefisso, includeredirects=False, site=site)
def cleanfolder(dirpath):
    """Ensure that *dirpath* exists and is empty.

    The directory is created when missing; otherwise every entry in it is
    deleted (sub-directories recursively, everything else as a plain file).

    Args:
        dirpath: Path of the working directory to reset.
    """
    if not os.path.isdir(dirpath):
        os.mkdir(dirpath)
    entries = [os.path.join(dirpath, name) for name in os.listdir(dirpath)]
    for entry in entries:
        try:
            # Works for directories; fails with OSError for plain files.
            shutil.rmtree(entry)
        except OSError:
            os.remove(entry)
def downloadDjvu(base):
    """Download a DjVu file from Commons into ``input/Djvu.djvu``.

    The ``input`` folder is emptied first.

    Args:
        base: File name on Commons, without the ``File:`` prefix.

    Returns:
        The number of bytes downloaded.
    """
    cleanfolder("input")
    sito = bot.Site("commons", "commons")
    paginaFilepage = bot.ImagePage(bot.Page(sito, "File:" + base))
    paginaFileUrl = paginaFilepage.fileUrl()
    risposta = opener.open(paginaFileUrl + "?action=render")
    try:
        immagine = risposta.read()
    finally:
        # Release the connection even if the read fails.
        risposta.close()
    with open("input/Djvu.djvu", "wb") as f:
        f.write(immagine)
    return len(immagine)
def downloadPagine(base):
cleanfolder("pagine")
g=pagegenerators.ReferringPageGenerator(bot.Page(it,"Template:Ritaglio"),onlyTemplateInclusion=True)
for pagina in g:
if "Pagina:"+base in pagina.title():
numeroPagina=pagina.title()[pagina.title().rfind("/")+1:].zfill(4)
open("pagine/"+numeroPagina+".txt","w").write(pagina.get().encode("utf-8"))
print "XXX ",pagina.title()
return
def extractTiff():
cleanfolder("tiff")
l=os.listdir("pagine")
for pagina in l:
np=pagina.replace(".txt","")
while np.startswith("0"):
np=np[1:]
djvu='input/Djvu.djvu'
imm='tiff/'+pagina.replace(".txt",'.tiff')
comando='ddjvu -page=%s -format=tiff %s %s' % (np,djvu,imm)
r=os.system(comando)
print r,comando
return
def renameJp2(delta):
    """Rename ``jp2/*_<number>.jp2`` to ``jp2/<number + delta>.jp2``.

    The new number is zero-padded to four digits. Files whose name does not
    match ``_<digits>.jp2`` are left untouched instead of aborting the run.

    Args:
        delta: Offset added to the number found in each file name; an int
            or a string holding one (e.g. a command-line argument).
    """
    delta = int(delta)
    modello = re.compile(r"_([0-9]+)\.jp2")
    for f in os.listdir("jp2"):
        trovati = modello.findall(f)
        if not trovati:
            continue
        numeroNew = str(int(trovati[0]) + delta).zfill(4)
        os.rename(os.path.join("jp2", f),
                  os.path.join("jp2", numeroNew + ".jp2"))
    return
def vai(base, IA, delta):
    """Run the whole cropping workflow for one work.

    Args:
        base: Base name of the file/index on the wiki.
        IA: Internet Archive item identifier, or None to work from the
            DjVu file instead.
        delta: Offset between the IA image numbers and the wiki page
            numbers (int, numeric string or None for 0). Only used when
            *IA* is given.
    """
    if IA is None:  # work from the DjVu file
        downloadDjvu(base)
        downloadPagine(base)
        extractTiff()
        cropper(base, "tiff", None)
    else:
        # Command-line arguments arrive as strings.
        delta = 0 if delta is None else int(delta)
        download(IA, glob_pattern="*_jp2.zip", destdir="input",
                 verbose=True, no_directory=True)
        # The original referenced an undefined name ``IAid`` here.
        dezip(IA + "_jp2.zip")
        # NOTE(review): cropper() opens jp2/NNNN.jp2, while dezip() keeps
        # the archive's own file names; renameJp2() is the only code that
        # produces NNNN.jp2 names and consumes delta, so it is called here.
        renameJp2(delta)
        downloadPagine(base)
        cropper(base, "jp2", delta)
    return
# Textual parameters of {{Ritaglio}}: (result key, pattern).
_RITAGLIO_TESTO = (
    ("file", re.compile(r"file *= *(.+)")),
    ("page", re.compile(r"page *= *(.+)")),
)
# Pixel parameters of {{Ritaglio}}: (internal name, pattern).
_RITAGLIO_PIXEL = (
    ("wt", re.compile(r"width *= *(\d+)px")),
    ("h", re.compile(r"wHeight *= *(\d+)px")),
    ("w", re.compile(r"wWidth *= *(\d+)px")),
    ("x1", re.compile(r"wLeft *= *(\d+)px")),
    ("y1", re.compile(r"wTop *= *(\d+)px")),
)


def datiRitaglio(pagina):
    """Parse the {{Ritaglio}} templates found in a saved page.

    Args:
        pagina: Path of a UTF-8 text file holding the page wikitext.

    Returns:
        A list with one dict per template, in order of appearance. Each
        dict may hold ``"file"`` and ``"page"`` (stripped strings) and
        ``"coord"`` = ``[width, [left, top, left + wWidth, top + wHeight]]``
        in pixels. ``"coord"`` is present only when all five pixel
        parameters were found in that template; values are never carried
        over from a previous template.
    """
    with open(pagina) as f:
        t = unicode(f.read(), "utf-8")
    t = t.replace("{{ritaglio", "{{Ritaglio").replace("\n", "")
    dati = []
    for ritaglio in produci_lista(t, "{{Ritaglio", "}}", 1):
        voce = {}
        pixel = {}  # reset for every template
        for parametro in ritaglio.split("|"):
            for chiave, modello in _RITAGLIO_TESTO:
                trovati = modello.findall(parametro)
                if trovati:
                    voce[chiave] = trovati[0].strip()
            for chiave, modello in _RITAGLIO_PIXEL:
                trovati = modello.findall(parametro)
                if trovati:
                    pixel[chiave] = int(trovati[0])
        if len(pixel) == len(_RITAGLIO_PIXEL):
            x1, y1 = pixel["x1"], pixel["y1"]
            voce["coord"] = [pixel["wt"],
                             [x1, y1, x1 + pixel["w"], y1 + pixel["h"]]]
        dati.append(voce)
    return dati
def cropper(base,tipo="tiff",delta=0): # pagina è il nome derivante da listdir, tipo 0005.txt
cleanfolder("jpg")
for pagina in os.listdir("pagine"):
disambigua="abcdefghilmno"
dati=datiRitaglio(u"pagine/"+pagina)
if len(dati)==0:
continue
else:
for i in range(len(dati)):
try:
if tipo=="tiff":
dis=disambigua[i:i+1]
tiff=Image.open("tiff/"+pagina.replace(".txt",".tiff"))
fc=tiff.size[0]*1.0/dati[i]["coord"][0]
co=dati[i]["coord"][1]
box=(int(co[0]*fc),int(co[1]*fc),int(co[2]*fc),int(co[3]*fc))
jpg=tiff.crop(box)
if len(dati)==1:
jpg.save(u"jpg/"+base.replace(".djvu","")+"-"+pagina.replace(".txt","")+".jpg")
else:
jpg.save(u"jpg/"+base.replace(".djvu","")+"-"+pagina.replace(".txt","")+dis+".jpg")
elif tipo=="jp2":
dis=disambigua[i:i+1]
jp2=Image.open("jp2/"+pagina.replace(".txt",".jp2"))
fc=jp2.size[0]*1.0/dati[i]["coord"][0]
co=dati[i]["coord"][1]
box=(int(co[0]*fc),int(co[1]*fc),int(co[2]*fc),int(co[3]*fc))
jpg=jp2.crop(box)
if len(dati)==1:
jpg.save(u"jpg/"+base.replace(".djvu","")+"-"+pagina.replace(".txt","")+".jpg")
else:
jpg.save(u"jpg/"+base.replace(".djvu","")+"-"+pagina.replace(".txt","")+dis+".jpg")
else:
print "Errore tipo file in pagina ",pagina
except:
print "Errore in pagina ",pagina
return
# utilities
# Nuova versione, gestisce i tag annidati; x e' la parte "aspecifica" del
# tag di apertura (es: {{ cercando {{Intestazione| )
def find_stringa(stringa, idi, idf, dc=0, x=None, side="left"):
    """Return the text enclosed between two delimiters.

    Args:
        stringa: Text to search.
        idi: Opening delimiter.
        idf: Closing delimiter.
        dc: 0 to return only the enclosed text, anything else to return it
            together with both delimiters.
        x: Optional generic opening marker used to handle nesting (e.g.
            ``"{{"`` when *idi* is ``"{{Intestazione|"``): the match is
            extended until the slice holds no more *x* than *idf*.
        side: ``"right"`` to start from the last occurrence of *idi*;
            any other value starts from the first one.

    Returns:
        The matched text, or ``""`` when *idi* is absent, when no closing
        *idf* follows it, or when nested openings are left unclosed.
    """
    if side == "right":
        inizio = stringa.rfind(idi)
    else:
        inizio = stringa.find(idi)
    if inizio < 0:
        return ""
    chiusura = stringa.find(idf, inizio + len(idi))
    if chiusura < 0:
        # find() returned -1: adding len(idf) must not turn it into an index.
        return ""
    fine = chiusura + len(idf)
    if x is not None:
        while stringa[inizio:fine].count(x) > stringa[inizio:fine].count(idf):
            chiusura = stringa.find(idf, fine)
            if chiusura < 0:
                return ""
            fine = chiusura + len(idf)
    if dc == 0:
        return stringa[inizio + len(idi):fine - len(idf)]
    return stringa[inizio:fine]
def produci_lista(testo, idi, idf, dc=1, inizio=None):
    """Collect every delimited element of *testo* into a list.

    Elements are located with ``find_stringa`` and removed from a working
    copy of the text one at a time until none is left.

    Args:
        testo: Text to scan.
        idi: Opening delimiter.
        idf: Closing delimiter.
        dc: 0 to store the elements without delimiters, anything else to
            keep the delimiters.
        inizio: Generic opening marker forwarded to ``find_stringa`` for
            nested elements.

    Returns:
        The list of elements in the order they were found.
    """
    residuo = testo[:]
    trovati = []
    while True:
        blocco = find_stringa(residuo, idi, idf, 1, inizio)
        if blocco == "":
            break
        # Drop the element just found so the next search moves forward.
        residuo = residuo.replace(blocco, "", 1)
        if dc == 0:
            blocco = find_stringa(blocco, idi, idf, 0, inizio)
        trovati.append(blocco)
    return trovati
def carica_pcl(nome_file, folder="dati/"):
    """Load a pickled object from ``<folder><nome_file>.pcl``.

    Args:
        nome_file: File name without the ``.pcl`` extension.
        folder: Folder prefix, including the trailing separator.

    Returns:
        The unpickled object.
    """
    nome_file = folder + nome_file + ".pcl"
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The file mode is left unchanged so that files written by salva_pcl()
    # keep loading the same way.
    with open(nome_file) as f:
        return pickle.load(f)
def salva_pcl(variabile,nome_file="dato",folder="dati/"):
nome_file=folder+nome_file+".pcl"
f=open(nome_file,"w")
pickle.dump(variabile, f)
f.close()
print "Variabile salvata nel file "+nome_file
return
## routine ereditate da djvuCl.py
def path2url(path):
    """Convert a local filesystem path into a ``file:`` URL."""
    percorso_url = urllib.pathname2url(path)
    return urlparse.urljoin('file:', percorso_url)
def downloadItem(IAid):
    """Download the DjVu XML and the JP2 zip of an Internet Archive item.

    The ``input`` folder is emptied first; both files are then fetched
    into it.

    Args:
        IAid: Internet Archive item identifier.
    """
    cleanfolder("input")
    for modello in ("*_djvu.xml", "*_jp2.zip"):
        download(IAid, glob_pattern=modello, destdir="input",
                 verbose=True, no_directory=True)
    return
def jp2tojpg():
cleanfolder("jpg")
listaJp2=os.listdir("jp2")
listaJp2.sort()
for f in range(len(listaJp2)):
if listaJp2[f].endswith(".jp2"):
fout=listaJp2[f][0:-4]+".jpg"
image=Image.open(os.path.join("jp2",listaJp2[f]))
if f==0 and image.size[0]<1000:
fattore=1024.0/image.size[0]
image=image.resize((int(image.size[0]*fattore),int(image.size[1]*fattore)))
image.save(os.path.join("jpg",fout))
## comando="magick convert jp2\%s jpg\%s" % (f,f[0:-4]+".jpg")
## res=os.system(comando)
print fout, " salvata"
return
def jpgtodjvu():
cleanfolder("djvu")
listaJpg=os.listdir("jpg")
for f in listaJpg:
if f.endswith(".jpg"):
comando="c44 jpg\%s djvu\%s" % (f,f[0:-4]+".djvu")
res=os.system(comando)
print res,comando
return
def merge(pathdjvu="djvu"):
listaDjvu=os.listdir(pathdjvu)
listaDjvu.sort()
lista=""
for n in range(len(listaDjvu)):
if listaDjvu[n].endswith(".djvu"):
lista+=os.path.join("djvu",listaDjvu[n])+" "
if len(lista)>7500:
break
djvuBundled=os.path.join("output",listaDjvu[0].replace("_0000.djvu",".djvu"))
comando="djvm -c %s %s" % (djvuBundled,lista)
res=os.system(comando)
print res,comando
if n<len(listaDjvu):
np=n+1
for n in range(np,len(listaDjvu)):
comando="djvm -i %s %s" % (djvuBundled,os.path.join("djvu",listaDjvu[n]))
res=os.system(comando)
print res,comando
return lista
def editXml(IAid):
xmlFile=os.path.join("input",IAid)+"_djvu.xml"
xml=open(xmlFile).read()
url=find_stringa(xml,'OBJECT data="','"',0)
urlNew=path2url(os.getcwd())+"/output/"+IAid+".djvu"
xml=xml.replace(url,urlNew)
open(xmlFile,"w").write(xml)
print "File "+IAid+"_djvu.xml modificato"
return
def caricaTesto(IAid):
editXml(IAid)
comando="djvuxmlparser %s" % (os.path.join("input",IAid+"_djvu.xml"))
print comando
res=os.system(comando)
print "risultato: ",res
return
def dezip(zipf):
cleanfolder("jp2")
z=zipfile.ZipFile(os.path.join("input",zipf))
for f in z.namelist():
jp2=f.split("/").pop()
if jp2.endswith(".jp2"):
data=z.read(f)
open(os.path.join("jp2",jp2),"wb").write(data)
print jp2," saved"
return
if __name__ == "__main__":
    # Command line: <base File/Index name> [<IA identifier> [<delta>]]
    argomenti = sys.argv[1:]
    if 1 <= len(argomenti) <= 3:
        # Missing trailing arguments are passed to vai() as None.
        vai(*(argomenti + [None] * (3 - len(argomenti))))
'{{}}'