# User:Alex brollo/voclad.py
# Appearance
##!/usr/bin/python
## -*- coding: utf-8 -*-
"""Test processing of pdftohtml output (html, xml, txt).

Runs ``pdftohtml -xml`` on a single-page PDF, rebuilds the text from the
positioned ``<text>`` elements, converts it to wiki markup and optionally
saves it to the matching ``Page:`` on mul.wikisource.
"""
import os  # generic file functions and system commands (kept; no longer used here)
import re
import subprocess
import unicodedata as ud

import bs4
import pywikibot as bot

# find_stringa and produci_lista are defined in this file instead of being
# imported from BrolloBot3.

mul = bot.Site("mul", "wikisource")
it = bot.Site("it", "wikisource")

# Page skeleton: the slots are left header, page number, right header, body.
basePagina = (
    '<noinclude><pagequality level="1" user="BrolloBot" />'
    + '{{tst|vdll}}{{rh|%s|%d|%s}}\n----</noinclude>%s<noinclude></noinclude>'
)

# Difference between the PDF page index and the number shown in the running
# header. NOTE(review): presumably the count of unnumbered front-matter
# pages -- confirm against the scan.
_PAGE_OFFSET = 35

_RE_NEWLINES_BEFORE_BOLD = re.compile(r"\n+<b>")
_RE_NEWLINE_AND_SPACES = re.compile(r"\n +")
# NOTE(review): the pasted source had a line break inside this pattern, right
# after the space; it is reconstructed here as a single space followed by
# "</b>". Verify against the original script.
_RE_LEMMA = re.compile(r"\n\n<b>(.+?) </b>")


def rebuild(linee):
    """Join the ``<text>`` elements of a pdftohtml XML page into one string.

    ``linee`` is the sequence of ``<text>`` tags; each must expose ``top``
    and ``left`` attributes convertible to ``int``. A newline is inserted
    before an element when the previous element lies higher on the page
    (smaller ``top``) and does not start further left, or when the previous
    element is more than 100 units lower than the current one.

    In the joined text the first two ``<b>`` become ``<rh>`` and the first
    two ``</b>`` become ``</rh>`` plus a newline, marking the running heads.
    """
    pieces = []
    previous = None
    for current in linee:
        if previous is not None:
            prev_top = int(previous["top"])
            cur_top = int(current["top"])
            # "left" is only read when the first test needs it, as before.
            if prev_top < cur_top and int(previous["left"]) >= int(current["left"]):
                pieces.append("\n")
            elif prev_top - cur_top > 100:
                pieces.append("\n")
        pieces.append(cleanup(current))
        previous = current
    testo = "".join(pieces)
    return testo.replace("<b>", "<rh>", 2).replace("</b>", "</rh>\n", 2)


def cleanup(linea):
    """Return the inner markup of a ``<text>`` element as a string.

    Removes every ``</text>`` and then drops everything up to and including
    the first ``>`` (the end of the opening tag).
    """
    linea = str(linea).replace("</text>", "")
    # Search the already-stripped string so the index matches what is sliced.
    return linea[linea.find(">") + 1:]


def xml2html(xml):
    """Convert pdftohtml XML (a string) into wiki-ready text.

    Side effect: the intermediate rebuilt text is written to ``pagina.html``
    in the current directory.
    """
    soup = bs4.BeautifulSoup(xml, "lxml")
    html = rebuild(soup.find_all("text"))
    with open("pagina.html", "w", encoding="utf-8") as out:
        out.write(html)
    # Every bold start gets exactly one blank line before it.
    html = _RE_NEWLINES_BEFORE_BOLD.sub("\n\n<b>", html)
    # A line that starts with spaces becomes an explicit line break.
    html = _RE_NEWLINE_AND_SPACES.sub("<br>\n", html)
    html = (
        html.replace("</i><i>", "")
        .replace("</b><b>", "")
        .replace("</i>\n<i>", "\n")
        .replace("</b>\n<b>", "\n")
        .replace("-\n", "")  # rejoin words hyphenated at a line end
    )
    return _RE_LEMMA.sub(r"\n\n<b>{{lemma|\1|l}} </b>", html)


def rh(testo, pagina):
    """Move the running head into the ``#left``/``#right`` placeholder.

    The first two ``<rh>...</rh>`` segments are removed from ``testo``. The
    content of the first one is used as header, unless it is all digits, in
    which case the second one is used. The header replaces ``#left`` when
    ``pagina - 35`` is even and ``#right`` otherwise; leftover placeholders
    are blanked and the result is NFC-normalised.

    With fewer than two ``<rh>`` segments the missing ones count as empty
    (previously an ``IndexError``).
    """
    pag = pagina - _PAGE_OFFSET
    heads = produci_lista(testo, "<rh>", "</rh>", 1)
    print(str(heads[:6]))
    first_two = heads[:2]
    titles = [find_stringa(head, "<rh>", "</rh>", 0) for head in first_two]
    for head in first_two:
        testo = testo.replace(head + "\n", "").replace(head + "<br>\n", "")
    header = titles[0] if titles else ""
    if header.isdigit():
        # The first segment is the page number; use the other one.
        header = titles[1] if len(titles) > 1 else ""
    placeholder = "#left" if pag % 2 == 0 else "#right"  # even page -> left
    testo = testo.replace(placeholder, header)
    testo = testo.replace("#right", "").replace("#left", "")
    return ud.normalize("NFC", testo)


def find_stringa(stringa, idi, idf, dc=0, x=None, side="left"):
    """Return the first substring of ``stringa`` delimited by ``idi``/``idf``.

    Parameters:
        stringa: text to search.
        idi: opening delimiter.
        idf: closing delimiter.
        dc: 0 returns the content without delimiters, any other value
            returns it with the delimiters included.
        x: optional nesting opener; when given, the match is extended to
            further ``idf`` occurrences until it holds no more ``x`` than
            ``idf``.
        side: "right" starts from the last ``idi``, anything else from the
            first.

    Returns "" when ``idi`` is absent or no suitable closing ``idf`` exists.
    Previously a missing ``idf`` yielded a garbage slice and could loop
    forever when ``x`` was given.
    """
    start = stringa.rfind(idi) if side == "right" else stringa.find(idi)
    if start == -1:
        return ""
    close = stringa.find(idf, start + len(idi))
    if close == -1:
        return ""
    end = close + len(idf)
    if x is not None:
        while stringa[start:end].count(x) > stringa[start:end].count(idf):
            close = stringa.find(idf, end)
            if close == -1:
                return ""  # unbalanced nesting
            end = close + len(idf)
    if dc == 0:
        return stringa[start + len(idi):end - len(idf)]
    return stringa[start:end]


def produci_lista(testo, idi, idf, dc=1, inizio=None):
    """Return every ``idi``...``idf`` segment of ``testo``, in order.

    Segments are found with ``find_stringa`` (``inizio`` is passed as its
    nesting opener) and removed one at a time from a working copy. With
    ``dc == 0`` the delimiters are stripped from each returned element.
    """
    remaining = testo
    lista = []
    while True:
        el = find_stringa(remaining, idi, idf, 1, inizio)
        if el == "":
            break
        remaining = remaining.replace(el, "", 1)
        if dc == 0:
            el = find_stringa(el, idi, idf, 0, inizio)
        lista.append(el)
    return lista


def getXml(pagina, scrivi=True):
    """Convert PDF page ``pagina`` to wiki text and optionally upload it.

    Runs ``pdftohtml -xml`` on
    ``../pdf/{pagina}_PDFsam_Vocabolardlladinleterar.pdf``, producing
    ``pagina.xml``, converts it, writes the result to ``pagina.txt`` and,
    when ``scrivi`` is true, saves it to
    ``Page:Vocabolardlladinleterar.pdf/{pagina}`` on mul.wikisource.

    Returns the page text.

    Raises ``subprocess.CalledProcessError`` if pdftohtml fails, so that a
    stale ``pagina.xml`` from an earlier run is never uploaded.
    """
    subprocess.run(
        [
            "pdftohtml",
            "-xml",
            f"../pdf/{pagina}_PDFsam_Vocabolardlladinleterar.pdf",
            "pagina.xml",
        ],
        check=True,
    )
    with open("pagina.xml", encoding="utf-8") as src:
        xml = src.read()
    testo = xml2html(xml)
    testo = basePagina % ("#left", pagina - _PAGE_OFFSET, "#right", testo)
    testo = rh(testo, pagina)
    with open("pagina.txt", "w", encoding="utf-8") as out:
        out.write(testo)
    if scrivi:
        bot.Page(mul, "Page:Vocabolardlladinleterar.pdf/" + str(pagina)).put(testo)
    return testo