Python code

# -*- coding: utf-8 -*-
"""
Created on Fri Jan 10 18:00:16 2019

@author: sbramhvanshi
"""


from PIL import Image, ImageDraw
import os
import csv
import re
import string
from flask import current_app as app
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from typing import Dict, List, Any, Union
import time



from PyTesseractOCR import PyTesseractOCR
#from includes.MizSpellCorrection import MizSpellCorrection

class NotesToArray:
    
    result_function = None  # type: Dict[str, Union[str, List[Any], int]]
    
    def __init__(self):
        
        self.short_text_arr = []
        self.text_position_array = []
        self.final_output_array = []
        
        self.score_label_matching_score = 75
        self.result_function = dict()
        self.result_function['status'] = "Success"
        self.result_function['message'] = "Working Great!!!"
        self.result_function['data'] = []
        self.result_function['code'] = 0
        self.threshold_for_cropping = 30
        self.fuzzy_march_score = 85
        self.initial_skip_to_avoid_serial_number = 0
        self.header_cordinates =  dict()
        self.minimum_fluctation_from_header = 99
        self.fuzzy_score_for_matching_numbers = 90
        self.ignore_words_for_position = ["-"]
        self.max_number_of_line_travel = 5
        self.space_distance_check_minimum = 30
         
          
          
    def fn_get_notes_array(self, file_path_image,input_data):
        """OCR an image and extract the table found below a given label.

        Args:
            file_path_image: path of the image handed to ``PyTesseractOCR``.
            input_data: mapping with two keys read here:
                ``'label'`` - text of the line where the note starts, and
                ``'total_values'`` - mapping of column name to the total printed
                in that column (used to locate the columns and the last row).

        Returns:
            The shared result dict (``self.result_function``). On success its
            ``'data'`` holds the extracted rows; when the label or the totals
            cannot be located its ``'status'`` is ``"Error"``.
        """
        obj_text = PyTesseractOCR()
        tsv_file_path = obj_text.fn_get_tsv_file(file_path_image, 0,'')

        self.text_position_array = pd.read_csv(tsv_file_path, sep='\t', quoting=csv.QUOTE_NONE, encoding='latin1')

        result_start = self.findTextInTsv(input_data['label'])
        if not result_start or result_start.get('status') != 'Success':
            # The label lookup records its failure in the shared envelope; hand
            # that back instead of failing on a missing start point.
            return self.result_function
        start_point = result_start['data']

        result_headers = self.get_header_cordinates(input_data['total_values'])
        if result_headers['status'] != 'Success':
            print(result_headers)
            # Return the error envelope so callers can inspect 'status'.
            return result_headers

        data = result_headers['data']
        self.set_minimum_fluctation_from_header()

        # Keep only the words between the label line and the totals line
        # (plus a small margin below the totals).
        tops = self.text_position_array['top']
        lower_limit = data[0] + self.threshold_for_cropping
        in_band = (tops >= start_point[1]) & (tops < lower_limit)
        self.text_position_array = self.text_position_array[in_band].reset_index(drop=True)

        self.get_an_simple_array_from_tsv_dataframe(tsv_file_path)

        return self.get_array_from_image()
          
    def get_array_from_image(self):      
       """Convert ``self.short_text_arr`` into table rows and return the result dict.

       For every ``[line_number, line_string]`` entry a row of empty strings is
       created: one cell for "Particulars" plus one per key of
       ``self.header_cordinates``. Number-like tokens are pulled out of the line
       with a regex and placed into columns by ``update_number_values_in_row``.
       If ``check_remove_space_between_numbers`` returns a shorter token list
       (tokens were merged), a placed value may be replaced by the merged token
       that contains it. Finally the alphabetic words of the line are joined
       with spaces and stored in cell 0.

       Rows are appended to ``self.final_output_array``; nothing in this method
       clears that list first.

       Returns:
           ``self.result_function`` with ``'data'`` set to
           ``self.final_output_array`` and ``'data_header'`` set to
           ``self.header_cordinates``.
       """
       
      
       # Column names: the row label first, then one column per located header.
       temp_array_final_header = ["Particulars"]
       for each_value in self.header_cordinates:
            temp_array_final_header.append(each_value)

       total_number_of_columns = len(self.header_cordinates)
  
       for i in range(0, len(self.short_text_arr), 1):
           # Start each row with one empty cell per column.
           temp_array_final = []
           for each_value in temp_array_final_header:
                temp_array_final.append("")
           each_line = self.short_text_arr[i]
           line_string = each_line[1]
           line_number = each_line[0]
           # Tokens matched: runs of digits/dots/commas, runs of '-' (inside the
           # character class "\b" is a backspace, not a word boundary), and
           # parenthesised numbers.
           number_string = re.findall(r"\b[\d\.\,]+\b|[\b\-]+|[(]\b[\d\.\,]+\b[)]", line_string[self.initial_skip_to_avoid_serial_number:])
           if len(number_string) > 0:
              
                temp_array_final = self.update_number_values_in_row(temp_array_final, line_number, number_string)
              
                # space_flag becomes 1 when the merge helper returned fewer
                # tokens than were passed in.
                space_flag = 0
                number_string_temp = []
                try:
                 if len(number_string) >= total_number_of_columns:
                 
                    number_string_temp = self.check_remove_space_between_numbers(number_string,line_number,"+")
                    if len(number_string_temp) < len(number_string):
                        space_flag = 1
                    else:
                        if len(number_string_temp) > total_number_of_columns:
                           
                            number_string_temp = self.check_remove_space_between_numbers(number_string,line_number,"-")
                            if len(number_string_temp) < len(number_string):
                                space_flag = 1
                # NOTE(review): bare except - any error raised by the merge helper
                # is reduced to this message and the row keeps its unmerged values.
                except:
                    print("You have an exception")
                    space_flag = 0
               
                
                #space_flag = 0
                if space_flag == 1:
                   # This inner loop reuses the name `i`; the outer loop rebinds
                   # `i` from its range on the next iteration.
                   for i in range(0, len(temp_array_final), 1):
                       if temp_array_final[i] not in number_string_temp and temp_array_final[i] != "" :
                           number_text = temp_array_final[i]
                          
                           for number_temp in number_string_temp:
                               
                               if str(number_text) in str(number_temp) and str(number_text) != str(number_temp):  #checking sub string
                                   #number_string_temp=number_string_temp.remove(number_temp)
                                   #check whether that number is present in the the further position
                                   #print(str(number_text)+"   "+str(number_temp) + "--"+str(len(temp_array_final))+"--")
                                   check_flag= 0
                                   # NOTE(review): the body reads temp_array_final[i], not [j],
                                   # so check_flag is set whenever this loop runs at least once;
                                   # the replacement below then only happens for the last cell.
                                   # Presumably [j] over later cells was intended - confirm.
                                   for j in range(i, len(temp_array_final) - 1, 1):
                                       
                                       temp_text = temp_array_final[i]
                                       if str(temp_text) == str(number_text):
                                           check_flag= 1
                                       #print(str(number_text)+"   "+str(number_temp) + "--"+str(i)+"--"+str(j)+"--"+str(check_flag))
                                   
                                   if check_flag == 0:
                                       temp_array_final[i] = number_temp
                                   # Only the first merged token containing the value is considered.
                                   break
                               
               
           # Cell 0 gets the words of the line (letters, '-' and '.'), space separated.
           cell_value_particular_arr = re.findall(r"(?i)\b[a-zA-Z-.]+\b", line_string)
           cell_value_particular_arr = " ".join(cell_value_particular_arr)
           temp_array_final[0] = cell_value_particular_arr
           self.final_output_array.append(temp_array_final)
               
       
       self.result_function['data'] = self.final_output_array
       self.result_function['data_header'] = self.header_cordinates 
       return self.result_function    
    def check_remove_space_between_numbers(self,number_string,line_number,direction):
        """Merge number tokens that sit close together on one OCR line.

        The rows of ``self.text_position_array`` whose ``line_num`` equals
        *line_number* are compared with the tokens in *number_string*. When
        enough rows fuzzy-match the tokens (score >= 95), neighbouring tokens
        whose horizontal gap is below ``self.space_distance_check_minimum`` are
        concatenated. Otherwise the method calls itself for the next
        (``direction == "+"``) or previous (``direction == "-"``) line number.

        Args:
            number_string: list of number tokens. NOTE: the first "." and the
                first "-" entries are removed from this very list (it is
                aliased, not copied), so the caller's list is modified.
            line_number: ``line_num`` value to look at first.
            direction: "+" to move to higher line numbers, "-" to lower ones.

        Returns:
            The list of (possibly merged) tokens, or *number_string* itself
            when no line qualified and no further line can be tried.
        """
        row_from_tsv= self.text_position_array[self.text_position_array['line_num'] == line_number]
        match_count = 0
        number_string_temp = []
        # Count the rows of this line whose text matches any of the tokens.
        for index, row_char in row_from_tsv.iterrows():
            single_text_from_line = row_char['text']
            for number in number_string:
                single_character_match_score = fuzz.token_sort_ratio(str(single_text_from_line), str(number))
                if single_character_match_score >= 95:
                    match_count = match_count + 1
                    break
            
        max_match_count = len(number_string)
        ignore_value_arrays = [".","-"]
        # Same list object as number_string: the removals below also change
        # number_string (and therefore the caller's list).
        temp_number_string = number_string
        for each_value in ignore_value_arrays:
            if each_value in temp_number_string: 
               temp_number_string.remove(each_value)
        max_match_count = len(temp_number_string)
       
        
        if match_count >= max_match_count:
            
            # `i` is the cursor into number_string; it is advanced by hand
            # inside the loop below.
            i = 1
            row_char = self.getexact_row_from_tsv_of_text(row_from_tsv,number_string[0])
            if row_char is None:
                # First token not found on this line: keep it as is and start
                # from the second token instead.
                number_string_temp.append(number_string[0])
                row_char = self.getexact_row_from_tsv_of_text(row_from_tsv,number_string[1])
                i = 2
                              
            # NOTE(review): row_char can still be None here (the lookup returns
            # None when nothing matches), in which case the subscripts raise.
            position_of_number =  row_char['left']
            number_text = row_char['text']
            width = row_char['width']
          
            
           
            # `j` only bounds the number of iterations; indexing uses `i`.
            for j in range(i, len(number_string), 1):
                if  i >= len(number_string):
                    break
               
                row_char = self.getexact_row_from_tsv_of_text(row_from_tsv,number_string[i])
                
                
                # Absolute gap between the right edge of the current token and
                # the left edge of the next one.
                difference = row_char['left'] - (position_of_number+width)
                if difference < 0:
                    difference = difference * -1
                #print(str(row_char['text']) +"<==>"+str(number_text)+"<--->"+str(difference))
                #print(str(row_char['left'])+"<-->"+str(position_of_number)+"<--->"+str(row_char['width']))
                
                if difference < self.space_distance_check_minimum and row_char['text'] != str(number_text) :
                    # Close enough: glue the two tokens together, then load the
                    # token after them as the new current token.
                    new_number_corrected = str(number_text)+str(row_char['text'])
                    number_string_temp.append(new_number_corrected)
                    i=i+1
                  
                    if  i >= len(number_string):
                      break
                    row_char = self.getexact_row_from_tsv_of_text(row_from_tsv,number_string[i])
                    position_of_number = row_char['left']
                    number_text = row_char['text']
                    width = row_char['width']
                    i=i+1
                   
                else:    
                    # Not adjacent: emit the current token and move on to the next.
                    number_string_temp.append(number_text)
                    position_of_number = row_char['left']
                    number_text = number_string[i]
                    width = row_char['width']
                    i=i+1
            
            # NOTE(review): the token held in number_text when the loop ends is
            # not appended before returning - confirm this is intended.
            return number_string_temp
            
        # This line did not qualify: try the neighbouring line in the
        # requested direction.
        if line_number < len(self.short_text_arr) and direction == "+":
            line_number = line_number + 1
            return self.check_remove_space_between_numbers(number_string,line_number,"+")
        if line_number > 0 and direction == "-":
            line_number = line_number - 1
            return self.check_remove_space_between_numbers(number_string,line_number,"-")
            
        return number_string
    
    def getexact_row_from_tsv_of_text(self,row_from_tsv,number):
        
        for index, row_char in row_from_tsv.iterrows():
            single_text_from_line = row_char['text']
            single_character_match_score = fuzz.token_sort_ratio(str(single_text_from_line), str(number))
            
            if single_character_match_score >= 95:
               return row_char
           
    def update_number_values_in_row(self, temp_array_final, line_number, number_string):
        
        column_count = 1
       
        position_value_result = self.get_best_possible_position_of_numbers(line_number, number_string,"+",line_number)
      
        # this constant assumes first column is for Major , Minor , Particulars , NOte
        if len(position_value_result) > 1:
            for each_column in self.header_cordinates:
                if each_column in position_value_result:
                    try:
                        temp_array_final[column_count] = position_value_result[each_column]
                    except:
                        print(position_value_result)
                column_count = column_count + 1
        else:
            column_count = 1
            position_value_result = self.get_best_possible_position_of_numbers(line_number, number_string,"-",line_number)
            if len(position_value_result) > 1:
                for each_column in self.header_cordinates:
                    if each_column in position_value_result:
                        try:
                            temp_array_final[column_count] = position_value_result[each_column]
                        except:
                            print(position_value_result)
                    column_count = column_count + 1
        return temp_array_final
    
    def get_best_possible_position_of_numbers(self, line_number, numbers, direction,orignal_line_number):
        position_value = dict()
        position_value['result'] = 'false'
        position_value['position'] = 0
        position_value['line_number'] = 1
        position_result = dict()
        position_result['line_number'] = line_number
        count = 1
        #this keywords are ignored while considering a match. Since they are saame and present in many placees
       
        #this temp line number is written to ensure that even if the 3rd value is 0 , it will take the best value
        # generally when the third value is zero line number becoes 0
        temp_line_number = 0
        for each_number in numbers:
   
            position_value = self.get_best_possible_position_of_individual_number(line_number, each_number, position_result)
           
            if position_value['result'] == 'true':
               
                if position_value['flag'] == 2:
                    position_result[position_value['header']] = each_number
                else:
                    position_result[position_value['header']] = each_number
                
                if each_number not in self.ignore_words_for_position:
                    count = count + 1
                temp_line_number  = position_value['line_number']
        
     
        if count > 1:
            position_result['line_number'] = temp_line_number
            return position_result
        else:
            difference_line_travelled = line_number - orignal_line_number
            if difference_line_travelled  < 0:
                difference_line_travelled = difference_line_travelled * -1
            if difference_line_travelled > self.max_number_of_line_travel:
                return position_result
            if line_number < len(self.short_text_arr) and direction == "+":
                line_number = line_number + 1
                position_result = self.get_best_possible_position_of_numbers(line_number, numbers,
                                                                             direction,orignal_line_number)
                return position_result
            if line_number > 0 and direction == "-":
                line_number = line_number - 1
                position_result = self.get_best_possible_position_of_numbers(line_number, numbers,
                                                                             direction,orignal_line_number)
                return position_result
        return position_result
    
    def get_best_possible_position_of_individual_number(self, line_number, number, position_result):

        position_value = dict()
        position_value['result'] = 'false'
        position_value['position'] = 0
        position_value['line_number'] = line_number
        position_left = 0
        header_with_relative_position = self.header_cordinates
        line_number_tsv = line_number
        header_row = self.text_position_array[self.text_position_array['line_num'] == line_number_tsv]
        
        for index, row_char in header_row.iterrows():
            single_text_from_line = row_char['text']
            flag = 0
            width = row_char['width']
            if str(number) == str(single_text_from_line) :
               position_left = row_char['left'] + width/2
               flag = 1
            else:
                
                text_match = fuzz.token_sort_ratio(str(number), str(single_text_from_line))
                if text_match > self.fuzzy_score_for_matching_numbers:
                    width = row_char['width']
                    position_left = row_char['left'] + width/2
                    flag = 2
            
                
                
            # minimum_fluctation_from_header
            comparision_array = dict()
       
            if flag > 0:
               
                for each_column in header_with_relative_position:

                    diff = position_left - header_with_relative_position[each_column]
                    if diff < 0:
                        diff = diff * -1
                    comparision_array[each_column] = diff    
                        
              
                comparision_array =sorted(comparision_array.items(),key=lambda x: x[1])
                
                comparision_value = comparision_array[0][0]
                #print(str(single_text_from_line) +"=="+str(flag)+"=="+str(position_left))
                #print(self.minimum_fluctation_from_header)
                #print(comparision_array)
                
                if comparision_value not in position_result and comparision_array[0][1] < self.minimum_fluctation_from_header:
                  
                    position_value['result'] = 'true'
                    position_value['header'] = comparision_value
                    position_value['line_number'] = line_number_tsv
                    position_value['text_file_value'] = str(number)
                    position_value['tsv_file_value'] = str(single_text_from_line)
                    position_value['flag'] = flag
                    return position_value   
                else:
                  if comparision_value in position_result and  position_result[comparision_value] in self.ignore_words_for_position and comparision_array[0][1] < self.minimum_fluctation_from_header :
                            position_value['result'] = 'true'
                            position_value['header'] = comparision_array[0][0]
                            position_value['line_number'] = line_number_tsv
                            position_value['text_file_value'] = str(number)
                            position_value['tsv_file_value'] = str(single_text_from_line)
                            position_value['flag'] = flag
                            return position_value                          
            
        return position_value
    def set_minimum_fluctation_from_header(self):
        header_with_relative_position = self.header_cordinates
      
        temp_array_headers = []
        
        for each_value in header_with_relative_position:
            temp_array_headers.append(header_with_relative_position[each_value])
        
        if len(temp_array_headers) >= 1:
            difference = temp_array_headers[1] - temp_array_headers[0]
            self.minimum_fluctation_from_header = difference/2
            
          
            
    def find_index(self,key):
         for index, element in enumerate(self.short_text_arr):
             if element[0] == key:
                 return [index,element]
         return 0      
    def get_an_simple_array_from_tsv_dataframe(self,file_path):
        self.short_text_arr = []
        df = self.text_position_array
        row_nums = df.line_num.unique()  
        row_nums = sorted(row_nums)
        print(max(row_nums))
        count = 0
        for i in range(1, max(row_nums)+1, 1):
            self.short_text_arr.append([i, ""])
        
        for row_num in row_nums:
            row_df = df[df['line_num']==row_num]
            line_string = ''
            
            for index, row_char in row_df.iterrows():
                if str(row_char['text']) != 'nan':
                    line_string = line_string + str(row_char['text']) + ' '
             
            remove = string.punctuation  
            march_match = fuzz.partial_ratio(line_string, "March")
            # if(march_match[0][1] >= self.fuzzy_march_score):
            # remove = remove.replace(",", "")
            if march_match >= self.fuzzy_march_score:
                remove = remove.replace(",", "")

            remove = remove.replace("-.", "")  # don't remove hyphens
            remove = remove.replace(",", "")  # don't remove comma
            remove = remove.replace("()", "")  # don't remove parenthesis
            pattern = r"[{}]".format(remove)  # create the pattern
            
            line_string = re.sub(pattern, "", line_string)

            # this condition is required to match the tsv output of positioning and text
            # text file given different line number and tsv gives different. If solution found
            # then we need to remove this condition
            line_string = re.sub('[^a-zA-Z0-9-.,()]',' ',line_string)
            line_string = line_string.replace('  ',' ').lstrip().rstrip()
            temp_string = line_string
            if temp_string.strip() != "":
                #self.short_text_arr.append([row_num, line_string])
                result = self.find_index(row_num)
                
                if result:
                   
                    self.short_text_arr.remove(result[1])
                    self.short_text_arr.insert(result[0],[row_num,line_string])
                #self.short_text_arr[row_num] = line_string
            
            count = count + 1
            
        
        print(self.short_text_arr)
        dir_path= os.path.split(os.path.abspath(os.path.splitext(file_path)[0]))
        out_filepath = os.path.join(dir_path[0], dir_path[1] + 'short_text.txt') 
        with open(out_filepath, "w") as txt_file:
            for line in self.short_text_arr:
                txt_file.write("".join(str(line)) + "\n") # works with any number of elements in a line   
    
    def get_header_cordinates(self,total_values):
        
        counter = 0
        average_cropping_cordinate_y = 0
       
        for each_value in total_values:
          
          result_end = self.findNumberInTsv(total_values[each_value])
          
          if result_end['status'] == 'Success':
              end_point = result_end['data']
              average_cropping_cordinate_y = average_cropping_cordinate_y + end_point[1]
              self.header_cordinates[each_value] = end_point[0]
              counter = counter + 1
              
        if counter != 0:
                 average_cropping_cordinate_y =   average_cropping_cordinate_y/counter
                 self.result_function['status'] = "Success"
                 self.result_function['message'] = "Cordinates Fetch Success Fully. Working Great!!!"
                 self.result_function['data'][0] = average_cropping_cordinate_y
                 self.result_function['data'][1] = self.header_cordinates
                 self.result_function['code'] = 0
                 return self.result_function
        else:
             self.result_function['status'] = "Error"
             self.result_function['message'] = "Sorry Unable to find header cordinates"
             self.result_function['data'] = []
             self.result_function['code'] = "NTA02"
             return self.result_function
           
     
          
    def fn_img_crop(self,start,end,file_path_image):
        """Crop a horizontal band out of an image and save it as PNG.

        The band spans the full width of the image, from ``start[1]`` down to
        ``end + self.threshold_for_cropping``. The result is written to a
        ``croppedImage`` folder next to the source image, using the source
        file's base name with a ``.png`` extension.

        Args:
            start: sequence whose item 1 is the upper y coordinate.
            end: lower y coordinate (before the margin is added).
            file_path_image: path of the image to crop.

        Returns:
            Path of the saved PNG file.
        """
        fileName = os.path.split(os.path.abspath(os.path.splitext(file_path_image)[0]))[1]
        filepath = os.path.join(os.path.dirname(file_path_image), "croppedImage")
        # exist_ok avoids failing when the folder appears between check and creation.
        os.makedirs(filepath, exist_ok=True)
        out_filepath = os.path.join(filepath, fileName + '.png')

        # The context manager closes the source file once the crop is saved.
        with Image.open(file_path_image) as im:
            im_width, _ = im.size
            # (left, upper, right, lower)-tuple.
            cropped = im.crop((0, start[1], im_width, end + self.threshold_for_cropping))
            cropped.save(out_filepath)
        return out_filepath
    
    def findNumberInTsv(self,text_value):
        
        df = self.text_position_array
        row_nums = df.line_num.unique()  
        row_nums = sorted(row_nums)
        x_cordinate  = 0
        y_cordinate = 0
        for row_num in row_nums:
            row_df = df[df['line_num']==row_num]
           
            
            for index, row_char in row_df.iterrows():
                
                text_match = fuzz.token_sort_ratio(str(text_value), str(row_char['text'])) 
            
                if text_match > self.score_label_matching_score:
                    x_cordinate = row_char['left']
                    y_cordinate = row_char['top']
                    self.result_function['status'] = "Success"
                    self.result_function['message'] = "Found: "+text_value+" Working Great!!!"
                    self.result_function['data'] = [x_cordinate,y_cordinate]
                    self.result_function['code'] = 0
                    return self.result_function
            
            
        self.result_function['status'] = "Error"
        self.result_function['message'] = "Match Not Found"
        self.result_function['data'] = [x_cordinate,y_cordinate]
        self.result_function['code'] = "NTA02"
        return self.result_function
    
    def findTextInTsv(self,text_value):
        
        df = self.text_position_array
        row_nums = df.line_num.unique()  
        row_nums = sorted(row_nums)
        x_cordinate  = 0
        y_cordinate = 0
        for row_num in row_nums:
            row_df = df[df['line_num']==row_num]
            line_string = ''
            
            
            for index, row_char in row_df.iterrows():
                if str(row_char['text']) != 'nan':
                    line_string = line_string + str(row_char['text']) + ' '
            text_match = fuzz.token_sort_ratio(text_value, str(line_string)) 
            
            if text_match > self.score_label_matching_score:
                x_cordinate = row_char['left']
                y_cordinate = row_char['top']
                self.result_function['status'] = "Success"
                self.result_function['message'] = "Found: "+text_value+" Working Great!!!"
                self.result_function['data'] = [x_cordinate,y_cordinate]
                self.result_function['code'] = 0
                return self.result_function
            
            
        self.result_function['status'] = "Error"
        self.result_function['message'] = "Match Not Found"
        self.result_function['data'] = [x_cordinate,y_cordinate]
        self.result_function['code'] = "NTA01"
    


# Ad-hoc driver: deskew one sample page, extract the note table below the
# given label and dump it to CSV.
# The paths are spelled with fully escaped backslashes; the resulting strings
# are the same as before, without relying on the invalid "\c" escape.
file_path_image_test = "images\\notes\\\\cbs_pages\\cbs_pages-4.jpg"
file_path_image_output = "images\\notes\\\\cbs_pages\\cbs_pages-4.png"

document_type = "ccf" # cbs , cpl
start = time.time()
obj = NotesToArray()
#result = obj.fn_get_formatted_array(file_path_image_test,document_type, 0, 0)
obj_text = PyTesseractOCR()
obj.cropping_mode = 0
obj_text.skrew_image_tuning(file_path_image_test,file_path_image_output,'')

# 'label' is the line where the note starts; 'total_values' maps each column
# name to the total printed in that column.
input_data = dict()
input_data['label'] = "Short Terms Loans and Advances"
total_values = dict()
total_values['2018'] = "5,12,98,507"
total_values['2017'] = "2,85,82,424"
input_data['total_values'] = total_values

result = obj.fn_get_notes_array(file_path_image_output,input_data)

print('end'+str(time.time()-start))


# The extraction may hand back nothing when it fails; check before subscripting.
if result is not None and result['status'] == 'Success':
    # The first extracted row is used as the column names of the CSV.
    pd.DataFrame(result["data"][1:], columns=result["data"][0]).to_csv("file_final_output.csv")
else:
    print(result)
(Visited 54 times, 1 visit today)

Leave A Comment

Your email address will not be published. Required fields are marked *