void-packages/srcpkgs/k2pdfopt/patches/03-willuslib-ocrtess.c-tess...

diff -r -u0 willuslib/ocrtess.c willuslib/ocrtess.c
--- willuslib/ocrtess.c	2018-12-31 19:59:58.000000000 +0100
+++ willuslib/ocrtess.c	2019-07-27 18:47:06.706765733 +0200
@@ -29,0 +30,258 @@
+
+
+/*
+** Create and initialize a Tesseract handle, and build a message describing it.
+**
+** datapath  tessdata folder passed to TessBaseAPIInit2().  If NULL, the
+**           message reports the TESSDATA_PREFIX environment value instead.
+** language  language code(s); NULL or "" selects "eng".
+** ocr_type  currently ignored (forced to 0 below).  Mapping used if the
+**           override is ever removed:
+**               ocr_type=0:  OEM_DEFAULT
+**               ocr_type=1:  OEM_TESSERACT_ONLY
+**               ocr_type=2:  OEM_LSTM_ONLY
+**               ocr_type=3:  OEM_TESSERACT_LSTM_COMBINED
+** out       if not NULL, the initialization message is printed to it.
+** initstr   if not NULL, receives up to maxlen-1 bytes of the message.
+** status    receives the TessBaseAPIInit2() result; non-zero means failure.
+**
+** Returns the handle as a void pointer, or NULL if initialization failed.
+** LC_ALL is set to "C" while this function runs and restored before return.
+*/
+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
+                     char *initstr,int maxlen,int *status)
+
+    {
+    char original_locale[1024];
+    char istr[1024];
+    const char *cur_locale;
+    const char *folder;
+    const char *engname;
+    const char *lang = "eng";
+    char **languages;
+    char **l;
+    size_t used;
+    int eng;
+    TessPageSegMode pagesegmode = PSM_SINGLE_BLOCK;
+    TessBaseAPI *api = TessBaseAPICreate();
+
+    /* willus mod, 11-24-16 */
+    /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
+    /* A composite LC_ALL name lists every category and can be long, and a */
+    /* truncated copy could not be restored, so keep a generous buffer.    */
+    cur_locale=setlocale(LC_ALL,NULL);
+    strncpy(original_locale,cur_locale!=NULL ? cur_locale : "C",
+            sizeof(original_locale)-1);
+    original_locale[sizeof(original_locale)-1]='\0';
+    setlocale(LC_ALL,"C");
+
+    if (language!=NULL && language[0]!='\0')
+        lang = language;
+
+    /* v4.00 loads either TESSERACT engine, LSTM engine, or both.  No CUBE. */
+    ocr_type=0; /* Ignore specified and use default */
+    TessBaseAPISetOutputName(api, NULL);
+    (*status)=TessBaseAPIInit2(api, datapath,lang,
+              ocr_type==0 ? OEM_DEFAULT :
+                (ocr_type==1 ? OEM_TESSERACT_ONLY :
+                   (ocr_type==2 ? OEM_LSTM_ONLY :
+                                  (OEM_TESSERACT_LSTM_COMBINED))));
+    if ((*status)!=0)
+        {
+        /* willus mod, 11-24-16 */
+        setlocale(LC_ALL,original_locale);
+        TessBaseAPIEnd(api);
+        TessBaseAPIDelete(api);
+        return(NULL);
+        }
+
+    /*
+    ** Only touch the page segmentation mode if it is still
+    ** PSM_SINGLE_BLOCK after Init, i.e. no config file changed it.  Since
+    ** pagesegmode is itself PSM_SINGLE_BLOCK this re-applies the same value.
+    ** It would be simpler to set the value before Init, but that doesn't work.
+    */
+    if (TessBaseAPIGetPageSegMode(api) == PSM_SINGLE_BLOCK)
+        TessBaseAPISetPageSegMode(api, pagesegmode);
+
+    /*
+    ** Initialization message.  Each append is bounded by the space left in
+    ** istr, so a long folder name or language list is truncated instead of
+    ** overflowing the buffer.
+    */
+    folder = datapath!=NULL ? datapath : getenv("TESSDATA_PREFIX");
+    if (folder==NULL)
+        folder="(null)";
+    snprintf(istr,sizeof(istr),"%s",TessVersion());
+    used=strlen(istr);
+    snprintf(&istr[used],sizeof(istr)-used,
+             "\n    Tesseract data folder = '%s'",folder);
+    used=strlen(istr);
+    snprintf(&istr[used],sizeof(istr)-used,"\n    Tesseract languages: ");
+
+    /* The engine mode is a property of the handle, so look it up once. */
+    eng=(int)TessBaseAPIOem(api);
+    engname = eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess");
+
+    /* Walk the language array up to its terminating NULL entry. */
+    languages = TessBaseAPIGetAvailableLanguagesAsVector(api);
+    for (l=languages; l!=NULL && (*l)!=NULL; l++)
+        {
+        used=strlen(istr);
+        snprintf(&istr[used],sizeof(istr)-used,"%s%s [%s]",
+                 l==languages?"":", ",*l,engname);
+        }
+    if (languages!=NULL)
+        TessDeleteTextArray(languages);
+
+    if (out!=NULL)
+        fprintf(out,"%s\n",istr);
+    if (initstr!=NULL && maxlen>0)
+        {
+        strncpy(initstr,istr,maxlen-1);
+        initstr[maxlen-1]='\0';
+        }
+
+    /* Turn off LSTM debugging output */
+    TessBaseAPISetVariable(api,"lstm_debug_level","0");
+    /* Bit 0 of WILLUSDEBUG raises the Tesseract debug levels instead. */
+#if (WILLUSDEBUG & 1)
+    TessBaseAPISetVariable(api,"lstm_debug_level","9");
+    TessBaseAPISetVariable(api,"paragraph_debug_level","9");
+    TessBaseAPISetVariable(api,"tessdata_manager_debug_level","9");
+    TessBaseAPISetVariable(api,"tosp_debug_level","9");
+    TessBaseAPISetVariable(api,"wordrec_debug_level","9");
+    TessBaseAPISetVariable(api,"segsearch_debug_level","9");
+#endif
+    /* willus mod, 11-24-16 */
+    setlocale(LC_ALL,original_locale);
+    return((void *)api);
+    }
+
+
+/* OCR pix into outstr (at most maxlen-1 bytes + NUL).  Returns 0, or -1 on error. */
+int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
+
+    {
+    TessBaseAPI *api=(TessBaseAPI *)vapi;
+    char *text;
+
+    if (outstr==NULL || maxlen<=0)
+        return(-1);
+    /* Set the mode on every call: a static cache is stale for a new handle. */
+    TessBaseAPISetPageSegMode(api,(TessPageSegMode)segmode);
+    /* A failed page and a NULL text result are reported the same way. */
+    text = TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL)
+               ? TessBaseAPIGetUTF8Text(api) : NULL;
+    if (text==NULL)
+        {
+        if (out!=NULL)
+            fprintf(out,"tesscapi:  Error during bitmap processing.\n");
+        TessBaseAPIClear(api);
+        return(-1);
+        }
+    strncpy(outstr,text,maxlen-1);
+    outstr[maxlen-1]='\0';
+    TessDeleteText(text);
+    TessBaseAPIClear(api);
+    return(0);
+    }
+
+
+/*
+** OCR pix into per-word arrays.  (*left) is one malloc'd block holding all five
+** int arrays ((*top) etc. point into it); (*text) is a malloc'd block of
+** NUL-terminated UTF-8 words.  Neither is freed here.  Returns 0 with the word
+** count in (*nw), or -1 with (*nw)=0 and the output pointers untouched.
+*/
+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
+                                int **left,int **top,int **right,int **bottom,
+                                int **ybase,char **text,int *nw,
+                                FILE *out)
+
+    {
+    TessBaseAPI *api=(TessBaseAPI *)vapi;
+    TessResultIterator *res_it;
+    int iword,nwords,totlen,it8,more;
+    int *x0,*y0,*x1,*y1,*ybaseline;
+    char *tutf8,*textstr;
+
+    (*nw)=0;
+    /* Set the mode on every call: a static cache is stale for a new handle. */
+    TessBaseAPISetPageSegMode(api,(TessPageSegMode)segmode);
+    res_it = TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL) ? TessBaseAPIGetIterator(api) : NULL;
+    if (res_it==NULL)
+        {
+        if (out!=NULL)
+            fprintf(out,"tesscapi:  Error during bitmap processing.\n");
+        TessBaseAPIClear(api);
+        return(-1);
+        }
+    /* Pass 1: count words and text bytes; read each word BEFORE advancing. */
+    nwords=totlen=0;
+    for (more=1; more; more=TessResultIteratorNext(res_it, RIL_WORD))
+        {
+        if ((textstr=TessResultIteratorGetUTF8Text(res_it, RIL_WORD))==NULL)
+            continue;
+        nwords++;
+        totlen+=strlen(textstr)+1;
+        TessDeleteText(textstr);
+        }
+    /* Allocate at least one element so a zero-word page never hits malloc(0). */
+    x0=(int *)malloc(sizeof(int)*5*(nwords>0 ? nwords : 1));
+    tutf8=(char *)malloc(totlen>0 ? totlen : 1);
+    if (x0==NULL || tutf8==NULL)
+        {
+        free(x0);
+        free(tutf8);
+        TessResultIteratorDelete(res_it);
+        TessBaseAPIClear(api);
+        return(-1);
+        }
+    (*left)=x0;
+    y0=(*top)=&x0[nwords];
+    x1=(*right)=&y0[nwords];
+    y1=(*bottom)=&x1[nwords];
+    ybaseline=(*ybase)=&y1[nwords];
+    (*text)=tutf8;
+    /* Pass 2: rewind and fill the arrays, never storing more than nwords. */
+    iword=it8=0;
+    TessPageIteratorBegin( (TessPageIterator *) res_it);
+    for (more=1; more && iword<nwords; more=TessResultIteratorNext(res_it, RIL_WORD))
+        {
+        int bbleft=0, bbtop=0, bbright=0, bbbottom=0;
+        int u1=0,v1=0,u2=0,v2=0;
+        if ((textstr=TessResultIteratorGetUTF8Text(res_it, RIL_WORD))==NULL)
+            continue;
+        strcpy(&tutf8[it8],textstr);
+        it8 += strlen(&tutf8[it8])+1;
+        TessPageIteratorBoundingBox( (TessPageIterator *) res_it, RIL_WORD, &bbleft, &bbtop, &bbright, &bbbottom);
+        /* Without a baseline, fall back to the bottom of the bounding box. */
+        if (!TessPageIteratorBaseline( (TessPageIterator *) res_it, RIL_WORD, &u1, &v1, &u2, &v2))
+            v1=v2=bbbottom;
+        x0[iword]=bbleft;
+        x1[iword]=bbright;
+        y0[iword]=bbtop;
+        y1[iword]=bbbottom;
+        ybaseline[iword]=(v1+v2)/2;
+        iword++;
+        TessDeleteText(textstr);
+        }
+    TessResultIteratorDelete(res_it);
+    (*nw) = iword;
+    TessBaseAPIClear(api);
+    return(0);
+    }
+
+
+/* Shut down and delete the Tesseract handle vapi; a NULL handle is ignored. */
+void tess_capi_end(void *vapi)
+
+    {
+    TessBaseAPI *handle=(TessBaseAPI *)vapi;
+    if (handle!=NULL)
+        {
+        TessBaseAPIEnd(handle);
+        TessBaseAPIDelete(handle);
+        }
+    }
New package: k2pdfopt 2019-07-27 20:49:45 +02:00			`diff -r -u0 willuslib/ocrtess.c willuslib/ocrtess.c`
			`--- willuslib/ocrtess.c 2018-12-31 19:59:58.000000000 +0100`
			`+++ willuslib/ocrtess.c 2019-07-27 18:47:06.706765733 +0200`
			`@@ -29,0 +30,258 @@`
			`+`
			`+`
			`+/*`
			`+** ocr_type=0: OEM_DEFAULT`
			`+** ocr_type=1: OEM_TESSERACT_ONLY`
			`+** ocr_type=2: OEM_LSTM_ONLY`
			`+** ocr_type=3: OEM_TESSERACT_LSTM_COMBINED`
			`+*/`
			`+void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,`
			`+ char *initstr,int maxlen,int *status)`
			`+`
			`+ {`
			`+ char original_locale[256];`
			`+ TessBaseAPI *api = TessBaseAPICreate();`
			`+`
			`+ /* willus mod, 11-24-16 */`
			`+ /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */`
			`+`
			`+ strncpy(original_locale,setlocale(LC_ALL,NULL),255);`
			`+ original_locale[255]='\0';`
			`+ setlocale(LC_ALL,"C");`
			`+`
			`+ // Make the order of args a bit more forgiving than it used to be.`
			`+ const char* lang = "eng";`
			`+ TessPageSegMode pagesegmode = PSM_SINGLE_BLOCK;`
			`+ if (language!=NULL && language[0]!='\0')`
			`+ lang = language;`
			`+`
			`+/*`
			`+v4.00 loads either TESSERACT enginer, LSTM engine, or both. No CUBE.`
			`+*/`
			`+ ocr_type=0; /* Ignore specified and use default */`
			`+ TessBaseAPISetOutputName(api, NULL);`
			`+ (*status)=TessBaseAPIInit2(api, datapath,lang,`
			`+ ocr_type==0 ? OEM_DEFAULT :`
			`+ (ocr_type==1 ? OEM_TESSERACT_ONLY :`
			`+ (ocr_type==2 ? OEM_LSTM_ONLY :`
			`+ (OEM_TESSERACT_LSTM_COMBINED))));`
			`+ if ((*status)!=0)`
			`+ {`
			`+ /* willus mod, 11-24-16 */`
			`+ setlocale(LC_ALL,original_locale);`
			`+ TessBaseAPIEnd(api);`
			`+ TessBaseAPIDelete(api);`
			`+ return(NULL);`
			`+ }`
			`+ /*`
			`+ api.Init("tesscapi",lang,tesseract::OEM_DEFAULT,`
			`+ &(argv[arg]), argc - arg, NULL, NULL, false);`
			`+ */`
			`+ // We have 2 possible sources of pagesegmode: a config file and`
			`+ // the command line. For backwards compatability reasons, the`
			`+ // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the`
			`+ // default for this program is tesseract::PSM_AUTO. We will let`
			`+ // the config file take priority, so the command-line default`
			`+ // can take priority over the tesseract default, so we use the`
			`+ // value from the command line only if the retrieved mode`
			`+ // is still tesseract::PSM_SINGLE_BLOCK, indicating no change`
			`+ // in any config file. Therefore the only way to force`
			`+ // tesseract::PSM_SINGLE_BLOCK is from the command line.`
			`+ // It would be simpler if we could set the value before Init,`
			`+ // but that doesn't work.`
			`+ if (TessBaseAPIGetPageSegMode(api) == PSM_SINGLE_BLOCK)`
			`+ TessBaseAPISetPageSegMode(api, pagesegmode);`
			`+`
			`+ /*`
			`+ ** Initialization message`
			`+ */`
			`+ {`
			`+ char istr[1024];`
			`+`
			`+// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode);`
			`+ sprintf(istr,"%s",TessVersion());`
			`+ sprintf(&istr[strlen(istr)],"\n Tesseract data folder = '%s'",datapath==NULL?getenv("TESSDATA_PREFIX"):datapath);`
			`+ strcat(istr,"\n Tesseract languages: ");`
			`+ char** languages = TessBaseAPIGetAvailableLanguagesAsVector(api);`
			`+/*`
			`+printf("OEM=%d\n",api->oem());`
			`+printf("Langs='%s'\n",api->GetInitLanguagesAsString());`
			`+printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang());`
			`+printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang());`
			`+printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs());`
			`+printf("languages.size()=%d\n",(int)languages.size());`
			`+*/`
			`+ char* l = languages;`
			`+ int eng = 0;`
			`+ TessBaseAPI *lang1;`
			`+ while (l != NULL)`
			`+ {`
			`+ eng=(int)TessBaseAPIOem(api);`
			`+ sprintf(&istr[strlen(istr)],"%s%s [%s]",l==languages?"":", ",l,`
			`+ eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess"));`
			`+ l++;`
			`+ }`
			`+`
			`+ TessDeleteTextArray(languages);`
			`+`
			`+ /*`
			`+ if (ocr_type==0 \|\| ocr_type==3)`
			`+ sprintf(&istr[strlen(istr)],"[LSTM+] (lang=");`
			`+ else if (ocr_type==2)`
			`+ sprintf(&istr[strlen(istr)],"[LSTM] (lang=");`
			`+ strncpy(&istr[strlen(istr)],language,253-strlen(istr));`
			`+ istr[253]='\0';`
			`+ strcat(istr,")");`
			`+ */`
			`+ if (out!=NULL)`
			`+ fprintf(out,"%s\n",istr);`
			`+ if (initstr!=NULL)`
			`+ {`
			`+ strncpy(initstr,istr,maxlen-1);`
			`+ initstr[maxlen-1]='\0';`
			`+ }`
			`+ }`
			`+`
			`+`
			`+ /* Turn off LSTM debugging output */`
			`+ TessBaseAPISetVariable(api,"lstm_debug_level","0");`
			`+#if (WILLUSDEBUG & 1)`
			`+ TessBaseAPISetVariable(api,"lstm_debug_level","9");`
			`+ TessBaseAPISetVariable(api,"paragraph_debug_level","9");`
			`+ TessBaseAPISetVariable(api,"tessdata_manager_debug_level","9");`
			`+ TessBaseAPISetVariable(api,"tosp_debug_level","9");`
			`+ TessBaseAPISetVariable(api,"wordrec_debug_level","9");`
			`+ TessBaseAPISetVariable(api,"segsearch_debug_level","9");`
			`+#endif`
			`+ /* willus mod, 11-24-16 */`
			`+ setlocale(LC_ALL,original_locale);`
			`+ return((void *)api);`
			`+ }`
			`+`
			`+`
			`+int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)`
			`+`
			`+ {`
			`+ TessBaseAPI *api;`
			`+ static int old_segmode=-1;`
			`+`
			`+ api=(TessBaseAPI *)vapi;`
			`+ if (old_segmode != segmode)`
			`+ {`
			`+ old_segmode=segmode;`
			`+ TessBaseAPISetPageSegMode(api, (TessPageSegMode)segmode);`
			`+ }`
			`+ if (!TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL))`
			`+ {`
			`+ /* pixDestroy(&pix); */`
			`+ if (out!=NULL)`
			`+ fprintf(out,"tesscapi: Error during bitmap processing.\n");`
			`+ TessBaseAPIClear(api);`
			`+ return(-1);`
			`+ }`
			`+ char* text = TessBaseAPIGetUTF8Text(api);`
			`+ strncpy(outstr,text,maxlen-1);`
			`+ outstr[maxlen-1]='\0';`
			`+ TessDeleteText(text);`
			`+ TessBaseAPIClear(api);`
			`+ return(0);`
			`+ }`
			`+`
			`+`
			`+int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,`
			`+ int **left,int **top,int **right,int **bottom,`
			`+ int **ybase,char **text,int *nw,`
			`+ FILE *out)`
			`+`
			`+ {`
			`+ TessBaseAPI *api;`
			`+ static int old_segmode=-1;`
			`+`
			`+ api=(TessBaseAPI *)vapi;`
			`+ if (old_segmode != segmode)`
			`+ {`
			`+ old_segmode=segmode;`
			`+ TessBaseAPISetPageSegMode(api, (TessPageSegMode)segmode);`
			`+ }`
			`+ if (!TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL))`
			`+ {`
			`+ if (out!=NULL)`
			`+ fprintf(out,"tesscapi: Error during bitmap processing.\n");`
			`+ TessBaseAPIClear(api);`
			`+ (*nw)=0;`
			`+ return(-1);`
			`+ }`
			`+`
			`+ //(*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text);`
			`+`
			`+ int iword,nwords,totlen,it8;`
			`+ int *x0,*y0,*x1,*y1,*ybaseline;`
			`+ char *tutf8;`
			`+`
			`+ TessResultIterator *res_it = TessBaseAPIGetIterator(api);`
			`+ /* Count words */`
			`+ iword=0;`
			`+ totlen=0;`
			`+ while(TessResultIteratorNext(res_it, RIL_BLOCK))`
			`+ {`
			`+ if(!TessResultIteratorNext(res_it, RIL_WORD))`
			`+ {`
			`+ continue;`
			`+ }`
			`+ iword++;`
			`+ char* textstr = TessResultIteratorGetUTF8Text(res_it, RIL_WORD);`
			`+ totlen+=strlen(textstr)+1;`
			`+ }`
			`+ nwords = iword;`
			`+`
			`+ x0=(*left)=(int *)malloc(sizeof(int)*5*nwords);`
			`+ y0=(*top)=&x0[nwords];`
			`+ x1=(*right)=&y0[nwords];`
			`+ y1=(*bottom)=&x1[nwords];`
			`+ ybaseline=(*ybase)=&y1[nwords];`
			`+ tutf8=(*text)=(char *)malloc(totlen);`
			`+ iword=0;`
			`+ it8=0;`
			`+ TessPageIteratorBegin( (TessPageIterator *) res_it);`
			`+ while (TessResultIteratorNext(res_it, RIL_BLOCK))`
			`+ {`
			`+ if (!TessResultIteratorNext(res_it, RIL_WORD))`
			`+ {`
			`+ continue;`
			`+ }`
			`+ char* textstr = TessResultIteratorGetUTF8Text(res_it, RIL_WORD);`
			`+ strcpy(&tutf8[it8],textstr);`
			`+ it8 += strlen(&tutf8[it8])+1;`
			`+`
			`+ int bbleft, bbtop, bbright, bbbottom;`
			`+ int u1,v1,u2,v2;`
			`+ TessPageIteratorBoundingBox( (TessPageIterator *) res_it, RIL_WORD, &bbleft, &bbtop, &bbright, &bbbottom);`
			`+ TessPageIteratorBaseline( (TessPageIterator *) res_it, RIL_WORD, &u1, &v1, &u2, &v2);`
			`+ x0[iword]=bbleft;`
			`+ x1[iword]=bbright;`
			`+ y0[iword]=bbtop;`
			`+ y1[iword]=bbbottom;`
			`+ ybaseline[iword]=(v1+v2)/2;`
			`+ iword++;`
			`+ }`
			`+`
			`+ TessResultIteratorDelete(res_it);`
			`+`
			`+ (*nw) = iword;`
			`+`
			`+ TessBaseAPIClear(api);`
			`+ return(0);`
			`+ }`
			`+`
			`+`
			`+void tess_capi_end(void *vapi)`
			`+`
			`+ {`
			`+ TessBaseAPI *api;`
			`+`
			`+ if (vapi==NULL)`
			`+ return;`
			`+ api=(TessBaseAPI *)vapi;`
			`+ TessBaseAPIEnd(api);`
			`+ TessBaseAPIDelete(api);`
			`+ }`