Progetto:Trascrizioni/Opal/opalLib.py
Aspetto
#!/usr/bin/python # -*- coding: utf-8 -*- import os, Image, ImageChops, shutil, math, pickle,sys from urllib import FancyURLopener import internetarchive class MyOpener(FancyURLopener): version = version = 'User-Agent: Alex (+http://it.wikisource.org/wiki/Utente:Alex_brollo)' opener=MyOpener() def iaMetadata(idOpal): mtd=carica_pcl("teatro") myMetadata={} myMetadata['mediatype']='texts' myMetadata['language']='ita' myMetadata['licenseurl']='http://creativecommons.org/publicdomain/zero/1.0/' myMetadata['description']='<br /><div>Scanned by Claudio Ruggeri, <a href="http://www.opal.unito.it" rel="nofollow">Opal Libri antichi</a>, University of Turin</div>' myMetadata['collection']='opensource_media' myMetadata['subject']='Italian theater;16th century' myMetadata['creator']=nomeCognome(mtd[idOpal][0]) myMetadata['title']=mtd[idOpal][1]+" ("+nomeCognome(mtd[idOpal][0])+")" myMetadata['year']=mtd[idOpal][5] myMetadata['printer']=mtd[idOpal][4] myMetadata['city']=mtd[idOpal][3] myMetadata['description']=" ".join([mtd[idOpal][0]+".",mtd[idOpal][1]+".",mtd[idOpal][2],mtd[idOpal][3]+":",mtd[idOpal][4],mtd[idOpal][5]])+myMetadata['description'] return myMetadata def nomeCognome(autore): nomecognome=autore if "," in autore: nomecognome=autore.split(",")[1].strip()+" "+autore.split(",")[0].strip() return nomecognome def grabOpal(idOpal,idia,raccolta="teatro"): idIA=idia # saving IA id into the global variable if raccolta=="teatro" or raccolta=="t": base="http://www.opal.unito.it/psixsite/Teatro%20italiano%20del%20XVI%20e%20XVII%20secolo/Elenco%20opere/" elif raccolta=="narrativa" or raccolta=="n": base="http://www.opal.unito.it/psixsite/Narrativa%20italiana%20del%20Seicento%20(e%20dintorni)/Elenco%20opere/" elif raccolta=="miscellanea" or raccolta=="m": base="http://www.opal.unito.it/psixsite/Miscellanea%20di%20testi%20di%20genere%20diverso/Elenco%20opere/" url=base+idOpal print "Grabbing..." 
grab(url,idIA+".pdf") # grabbing pdf file from Opal and saving it as [idIA].pdf print "Launching opal...." opal(idIA,idOpal,tipo="tiff") # launching the main routine to extract, split, zip and upload return def opal(id, idOpal, tipo="tiff", taglio=True ): separa(id+".pdf",tipo) np=0 print "Inizio splitting" tipoFile="jpg" if tipo=="tiff": tipoFile="tif" if taglio: tT(tipoFile) # splitting routine os.remove("public_html/out2/pag-0000."+tipoFile) # deleting first empty page since frontespice must be the first image zippa(id,taglio) # zipping and uploading print "Inizio caricamento su Opal" iaUpload(id,idOpal) print "Fatto" return def grab(url,output=None): page=opener.open(url+"?action=render") content=page.read() if output==None: output=url[url.rfind("/")+1:] open(output,"wb").write(content) print output return "Fatto" def carica_pcl(nome_file, folder=""): nome_file=folder+nome_file+".pcl" f=open(nome_file) contenuto=pickle.load(f) f.close() return contenuto def salva_pcl(variabile,nome_file="dato",folder=""): nome_file=folder+nome_file+".pcl" f=open(nome_file,"w") pickle.dump(variabile, f) f.close() print "Variabile salvata nel file "+nome_file return ## # Crop borders off an image. # # @param im Source image. # @param bgcolor Background color, using either a color tuple or # a color name (1.1.4 only). # @return An image without borders, or None if there's no actual # content in the image. 
def autocrop(im, bgcolor): if im.mode != "RGB": im = im.convert("RGB") bg = Image.new("RGB", im.size, bgcolor) diff = ImageChops.difference(im, bg) bbox = diff.getbbox() if bbox: return im.crop(bbox) return None # no contents def separa(filepdf, tipo): l=os.listdir("public_html/out") for i in l: os.remove("public_html/out/"+i) l=os.listdir("public_html/out2") for i in l: os.remove("public_html/out2/"+i) scriptBase='gs -sDEVICE=tiff24nc -r300x300 -sCompression=lzw -dNOPAUSE -dBATCH -sOutputFile="public_html/out/pag%04d.tif" '+filepdf print scriptBase os.system(scriptBase) return def zippa(IaId,taglio=True): IaId=IaId.replace(".pdf","") if taglio: os.system("zip -r public_html/"+IaId+"_images.zip public_html/out2") else: os.system("zip -r public_html/"+IaId+"_images.zip public_html/out") return def calcola(x1,y1,m0,m1,m2,m3): delta=(x1-m0-m2)*0.1 m0=m0-delta m1=m1-delta m2=m2-delta m3=m3-delta print m0/x1*100 print m1/y1*100 print m2/x1*100 print m3/y1*100 return def tT(tipoFile="tif"): # splits images of /out into /out2 lista=[] n=1 while os.path.isfile("public_html/out/pag"+str(n).zfill(4)+"."+tipoFile): lista.append("public_html/out/pag"+str(n).zfill(4)+"."+tipoFile) n+=1 for i in range(len(lista)): jpg0=Image.open(lista[i]) jpg0=autocrop(jpg0,(255,255,255)) xy0=jpg0.size[0] #larghezza xy1=jpg0.size[1] # altezza # creating the left page jpg1=jpg0.crop((0,0,int(xy0*0.5),xy1)) jpg1.save("public_html/out2/pag-"+f0(i*2)+"."+tipoFile) #print "public_html/out2/pag-"+f0(i*2)+"."+tipoFile, #creatibg the right page jpg2=jpg0.crop((int(xy0*0.5),0,xy0,xy1)) jpg2.save("public_html/out2/pag-"+f0(i*2+1)+"."+tipoFile) #print "out2/pag-"+f0(i*2+1)+"."+tipoFile return def f0(n,w=4): n="0000"+str(n) n=n[-w:] return n def iaUpload(iaId,idOpal,test=False): # iaId=iaId.replace(".pdf","") #print "Documentazione: https://pypi.python.org/pypi/internetarchive" #print "https://archive.org/account/s3.php" metadati=iaMetadata(idOpal) item=internetarchive.Item(iaId) print "idOpal: ",idOpal 
print "File zip: ","public_html/"+iaId+"_images.zip" for i in metadati: print i, metadati[i] if not test: if not item.exists: item.upload("public_html/"+iaId+"_images.zip", metadata=metadati) else: print "Item "+iaId+" already exists" else: print "Item "+iaId+" not uploaded (run test)" return def main(): if len(sys.argv)>=5: idOpal=sys.argv[1] idIa=sys.argv[2] os.environ['AWS_ACCESS_KEY_ID']=sys.argv[3] os.environ['AWS_SECRET_ACCESS_KEY']=sys.argv[4] print "Ids: ",idOpal,idIa grabOpal(idOpal,idIa) else: print "Parametri insufficienti" return if __name__ == "__main__": main()
La logica:
- al momento viene chiamato dal tool itsource con:
python opalLib.py [id di Opal] [id di Internet Archive] [chiave accesso Internet Archive] [chiave riservata Internet Archive]
- main() chiama grabOpal() che acchiappa il pdf a doppia facciata Opal poi chiama opal()
- opal() è lo script centrale che di seguito:
- chiama separa() che invoca gs e estrae le pagine come tiff in public_html/out
- poi chiama tT() (routine python PIL) che croppa i margini bianchi e divide le facciate mettendole in public_html/out2
- poi elimina la prima pagina (retro di copertina)
- poi chiama zippa() che comprime in uno zip i tiff di public_html/out2
- infine chiama iaUpload() che carica lo zip su IA dopo aver recuperato i metadati da un dizionario