You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
508 lines
24 KiB
508 lines
24 KiB
# -*- coding: utf-8 -*-
|
|
###############################################################################
|
|
#
|
|
# Cybrosys Technologies Pvt. Ltd.
|
|
#
|
|
# Copyright (C) 2024-TODAY Cybrosys Technologies(<https://www.cybrosys.com>)
|
|
# Author: Sruthi Renjith (odoo@cybrosys.com)
|
|
#
|
|
# You can modify it under the terms of the GNU LESSER
|
|
# GENERAL PUBLIC LICENSE (LGPL v3), Version 3.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU LESSER GENERAL PUBLIC LICENSE (LGPL v3) for more details.
|
|
#
|
|
# You should have received a copy of the GNU LESSER GENERAL PUBLIC LICENSE
|
|
# (LGPL v3) along with this program.
|
|
# If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
###############################################################################
|
|
import io
|
|
import os
|
|
from pdf2image import convert_from_bytes
|
|
from PIL import Image, ImageOps
|
|
import pytesseract
|
|
import re
|
|
import spacy
|
|
from odoo import api, fields, models, _
|
|
from odoo.exceptions import ValidationError
|
|
|
|
|
|
class OCRDataTemplate(models.TransientModel):
    """Transient model holding an uploaded JPG, JPEG, PNG or PDF document,
    the text read from it, and the model/fields that text is mapped to."""
    _name = "ocr.data.template"
    _description = "Data retrieving template"
    _rec_name = "file_name"

    # Uploaded document, stored as an attachment.
    image = fields.Binary(
        string="Document",
        attachment=True,
        required=True,
        help="Upload .jpg, .jpeg, .png or .pdf files",
    )
    # Name of the uploaded file; also used as the record name.
    file_name = fields.Char(
        string="Document Name",
        help="Document name",
    )
    # Copy of the uploaded document (filled by the onchange on ``image``).
    image2 = fields.Image(
        string="Document",
        help="Uploaded document",
    )
    # Set once the document has been read.
    flag = fields.Boolean(
        default=False,
        string="Is Read",
        help="Flag to check the document read or not",
    )
    # Text extracted from the document.
    data = fields.Text(
        string="Data",
        readonly=True,
        help="Content from the document",
    )
    # Target model, limited to the six models listed in the domain.
    model_name_id = fields.Many2one(
        'ir.model',
        string="Model",
        domain="[('model', 'in', "
               "['res.partner', 'account.move', 'hr.employee', "
               "'hr.expense', 'sale.order', 'purchase.order'])]",
        help="Model to which the data want to map",
    )
    # Fields of the target model that should receive extracted data.
    model_field_ids = fields.Many2many(
        'ir.model.fields',
        string="Fields",
        domain="[('model_id', '=', model_name_id)]",
        help="Fields names to map data",
    )
|
|
|
|
def data_segmentation(self, img):
    """
    Binarise the document image and cut it into segments.

    The image is converted to grayscale, thresholded (values above 176
    become white, everything else black), converted to RGB and inverted.
    Segments are then collected by repeatedly cropping the picture to
    its bounding box.

    :param img: The image form of the document that has to be segmented.
    :return: List of the cropped image segments (possibly empty).
    """
    binary = ImageOps.grayscale(img).point(
        lambda level: 255 if level > 176 else 0, '1')
    canvas = ImageOps.invert(binary.convert("RGB"))
    pieces = []
    bounds = canvas.getbbox()
    while bounds:
        piece = canvas.crop(bounds)
        width, height = piece.size
        if width > 0 and height > 0:
            pieces.append(piece)
        # NOTE(review): ImageOps.crop() treats its second argument as
        # border widths to strip from each side, not as a box -- confirm
        # that handing it the bounding box is what is intended here.
        canvas = ImageOps.crop(canvas, bounds)
        bounds = canvas.getbbox()
    return pieces
|
|
|
|
def action_get_data(self):
    """
    Read the uploaded .jpg, .jpeg, .png or .pdf document and store the
    recognised text in ``data``.

    The stored attachment of this record is loaded from the filestore,
    turned into one image (the pages of a PDF are stacked vertically),
    segmented with ``data_segmentation`` and the first segment is passed
    to pytesseract. A file with any other extension leaves ``data``
    empty.

    :raises ValidationError: when the document cannot be loaded or
        converted ("Cannot identify data") or when the text recognition
        fails ("Data cannot be read").
    """
    self.ensure_one()
    self.flag = True
    # The extension is compared case-insensitively so that e.g. "scan.PDF"
    # is accepted; a missing file name simply yields an empty extension.
    extension = os.path.splitext(self.file_name or '')[1].lower()
    segmented_data = []
    try:
        # Getting the file path from ir.attachments. The two-sided
        # 'res_field' condition matches attachments with and without a
        # res_field value.
        attachment = self.env["ir.attachment"].search(
            ['|', ('res_field', '!=', False), ('res_field', '=', False),
             ('res_id', '=', self.id),
             ('res_model', '=', 'ocr.data.template')],
            limit=1)
        file_path = attachment._full_path(attachment.store_fname)
        if extension in ('.jpg', '.jpeg', '.png'):
            with open(file_path, mode='rb') as document:
                img = Image.open(io.BytesIO(document.read()))
            segmented_data = self.data_segmentation(img)
        elif extension == '.pdf':
            with open(file_path, mode='rb') as document:
                pages = convert_from_bytes(document.read())
            # Every page is resized to the same size and the pages are
            # stacked top to bottom. The canvas is sized from the RESIZED
            # pages: sizing it from the original pages clipped the pasted
            # pages or left empty bands between them.
            resized_pages = [page.resize((2400, 1800)) for page in pages]
            canvas_width = max(page.width for page in resized_pages)
            canvas_height = sum(page.height for page in resized_pages)
            combined_image = Image.new(
                'RGB', (canvas_width, canvas_height))
            y_offset = 0
            for page in resized_pages:
                combined_image.paste(page, (0, y_offset))
                y_offset += page.height
            segmented_data = self.data_segmentation(combined_image)
    except Exception as error:
        # No record is unlinked here any more: the previous cleanup
        # removed the newest record of the model, which is not
        # necessarily this one. NOTE(review): the raised error is
        # expected to abort the request and roll its changes back --
        # confirm no caller relies on the record being deleted.
        raise ValidationError(_("Cannot identify data")) from error
    # Converting the segmented image into text using pytesseract.
    # NOTE(review): as before, only the first segment is recognised; the
    # remaining segments are ignored -- confirm this is intended.
    text = ""
    if segmented_data:
        try:
            text = pytesseract.image_to_string(segmented_data[0]) + "\n"
        except Exception as error:
            raise ValidationError(_("Data cannot be read")) from error
    # Assigning retrieved data into text field.
    self.data = text
|
|
|
|
@api.onchange('model_name_id')
def _onchange_model_name_id(self):
    """ Empty the selected fields whenever the target model changes, so
    that no field of the previously chosen model stays selected. """
    # Inside an onchange the value is assigned on the record instead of
    # calling write(); the (6, 0, []) command replaces the selection with
    # an empty list.
    self.model_field_ids = [(6, 0, [])]
|
|
|
|
def find_person_name(self):
    """
    Find a person name in the retrieved text using 'spacy'.

    :return: The text of the first entity labelled "PERSON" in ``data``,
        or an empty string when there is no such entity or no text.
    """
    text = self.data
    if not text:
        # Nothing has been extracted yet (unset or empty field): there is
        # no name to find and no need to load the language model.
        return ''
    nlp = spacy.load("en_core_web_sm")
    return next(
        (entity.text for entity in nlp(text).ents
         if entity.label_ == "PERSON"),
        '')
|
|
|
|
def get_order_line(self, text):
    """
    Find product lines in the retrieved data using regex.

    A product is a line followed by a "HSN/SAC Code: <digits>" line. Each
    product is returned as the tuple of the three captured groups:
    (text inside a leading "[...]" or '', rest of the line, HSN/SAC
    digits). Quantities and prices are the decimal numbers found after
    the "Quantity Unit" and "Amount" headings; they are returned as
    strings and only attached when their count equals the number of
    products.

    :param text: The extracted text to find the order lines from it
    :return: List of dictionaries with the key 'product' and, when the
        counts match, 'quantity' and/or 'price'. Empty when no product
        is found.
    """
    text = text or ''
    # The bracketed part is optional and captured as a whole. The previous
    # pattern used a lazy group that stopped after one character, so
    # "[A1] Chair" was split into 'A' and '1] Chair'.
    product_regex = r'(?:\[(.+?)\]\s*)?(.+)\n(?:HSN/SAC Code):\s+(\d+)'
    # NOTE(review): both patterns below capture everything up to the end
    # of the text, so the heading that comes first also collects the
    # numbers of the later one -- confirm against the document layout.
    quantity_regex = r"Quantity Unit\n([\d.\s\S]+)"
    unit_price_regex = r"Amount\n([\d.\s\S]+)"
    decimal_regex = r"\d+\.\d+"

    def decimals_after(heading_regex):
        """Return the decimal numbers found after the given heading."""
        found = re.search(heading_regex, text)
        return re.findall(decimal_regex, found.group(1)) if found else []

    quantities = decimals_after(quantity_regex)
    unit_prices = decimals_after(unit_price_regex)
    products = re.findall(product_regex, text)
    # Getting the products and its corresponding quantity and price.
    if len(products) == len(quantities) == len(unit_prices):
        return [
            {'product': product, 'quantity': quantity, 'price': price}
            for product, quantity, price
            in zip(products, quantities, unit_prices)]
    if len(products) == len(quantities):
        return [{'product': product, 'quantity': quantity}
                for product, quantity in zip(products, quantities)]
    if len(products) == len(unit_prices):
        return [{'product': product, 'price': price}
                for product, price in zip(products, unit_prices)]
    return [{'product': product} for product in products]
|
|
|
|
def action_process_data(self):
    """
    Process the fetched data: map it into a new record of the selected
    model and open that record in its form view.

    Supported models are the six models allowed by the domain of
    ``model_name_id``: res.partner, account.move, hr.employee,
    hr.expense, sale.order and purchase.order.

    :return: An ``ir.actions.act_window`` dictionary for the created
        record, or None when nothing was created.
    :raises ValidationError: when the selected fields cannot be filled
        from the data, or the record cannot be created.
    """
    self.ensure_one()
    # Dispatch on the technical model name. The display name compared
    # before ('Contact', 'Journal Entry', ...) is a label and does not
    # match any more once it is translated or renamed.
    handlers = {
        'res.partner': self._process_contact,
        'account.move': self._process_vendor_bill,
        'hr.employee': self._process_employee,
        'hr.expense': self._process_expense,
        'sale.order': self._process_sale_order,
        'purchase.order': self._process_purchase_order,
    }
    handler = handlers.get(self.model_name_id.model)
    return handler() if handler else None

def _form_action(self, title, res_model, res_id, view_xmlid):
    """Return the window action opening one record in the given form."""
    return {
        'name': title,
        'type': 'ir.actions.act_window',
        'view_type': 'form',
        'view_mode': 'form',
        'res_model': res_model,
        'res_id': res_id,
        'view_id': self.env.ref(view_xmlid).id,
        'target': 'current',
    }

def _extract_contact_details(self, name_fields, phone_fields,
                             email_fields, missing_name_message):
    """
    Collect person name, phone number and email address from ``data``
    for the selected fields.

    :param name_fields: field names that trigger the person name lookup
    :param phone_fields: field names that trigger the phone lookup
    :param email_fields: field names that trigger the email lookup
    :param missing_name_message: error text used when a name field is
        selected but no person name is found
    :return: tuple (person, phone_number, email_address); a value that
        was not requested or not found is an empty string
    :raises ValidationError: when a name is requested but not found, or
        when only fields that cannot be mapped are selected
    """
    phone_pattern = r'\(\d{3}\) \d{3}-\d{4}|\d{3}-\d{3}-\d{4}|\+\d{1}-\d{3}-\d{3}-\d{4}|\d{11}|P \+\d{3} \d{6}'
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
    # An unset text field is treated as empty text instead of crashing
    # the regex search.
    text = self.data or ''
    person = phone_number = email_address = ''
    name_selected = False
    unmapped_selected = False
    for field in self.model_field_ids:
        if field.name in name_fields:
            person = self.find_person_name()
            if not person:
                raise ValidationError(missing_name_message)
            name_selected = True
        elif field.name in phone_fields:
            found = re.search(phone_pattern, text)
            if found:
                phone_number = found.group(0)
        elif field.name in email_fields:
            found = re.search(email_pattern, text)
            if found:
                email_address = found.group(0)
        else:
            unmapped_selected = True
    if unmapped_selected and not name_selected:
        raise ValidationError(_("No data to map into the field"))
    return person, phone_number, email_address

def _process_contact(self):
    """Create a res.partner from the person name, phone and email found
    in the data and return the action opening it."""
    person, phone_number, email_address = self._extract_contact_details(
        ('name', 'display_name'), ('phone',), ('email',),
        _("Partner name cannot find"))
    if not person:
        raise ValidationError(_("Name field is not chosen to create"
                                " partner"))
    partners = self.env['res.partner']
    if partners.search([('name', '=', person)], limit=1):
        raise ValidationError(_("Partner already exist"))
    # Creating record in res.partner.
    partner_record = partners.create({
        'name': person,
        'email': email_address,
        'phone': phone_number,
    })
    return self._form_action("Partner", 'res.partner', partner_record.id,
                             'base.view_partner_form')

def _process_employee(self):
    """Create an hr.employee from the person name, phone and email found
    in the data and return the action opening it."""
    person, phone_number, email_address = self._extract_contact_details(
        ('name', 'display_name', 'emergency_contact'),
        ('work_phone', 'phone', 'emergency_phone'),
        ('private_email', 'work_email'),
        _("Employee name cannot find"))
    if not person:
        raise ValidationError(
            _("Name field is not chosen to create employee"))
    employees = self.env['hr.employee']
    if employees.search([('name', '=', person)], limit=1):
        raise ValidationError(_("Employee already exist"))
    # Creating a record in hr.employee by mapping the data into
    # employee name, work phone and work email.
    employee_record = employees.create({
        'name': person,
        'work_email': email_address,
        'work_phone': phone_number,
    })
    return self._form_action("Employee", 'hr.employee',
                             employee_record.id, 'hr.view_employee_form')

def _process_vendor_bill(self):
    """Re-read the stored document at double size and let the
    ``digitize.bill`` model create a vendor bill from its text.

    Only done when the field 'invoice_vendor_bill_id' is selected."""
    selected = {field.name for field in self.model_field_ids}
    if 'invoice_vendor_bill_id' not in selected:
        raise ValidationError(_("No data to map into the field"))
    try:
        # Taking the file path from ir.attachment.
        attachment = self.env["ir.attachment"].search(
            ['|', ('res_field', '!=', False), ('res_field', '=', False),
             ('res_id', '=', self.id),
             ('res_model', '=', 'ocr.data.template')],
            limit=1)
        file_path = attachment._full_path(attachment.store_fname)
        with open(file_path, mode='rb') as document:
            img = Image.open(io.BytesIO(document.read()))
        # Resizing the image to improve the clarity.
        resized_img = img.resize((img.width * 2, img.height * 2),
                                 resample=Image.BICUBIC)
        # Converting the image into text using OCR python package
        # pytesseract.
        text = pytesseract.image_to_string(resized_img)
    except Exception as error:
        raise ValidationError(_("Can't create vendor bill")) from error
    # Calling the function to create vendor bill from model digitize.bill.
    bill_record = self.env['digitize.bill'].create_record(text)
    return self._form_action("Bill", 'account.move', bill_record.id,
                             'account.view_move_form')

def _process_expense(self):
    """Create an hr.expense for the product 'BILL EXPENSE' (created when
    missing), using the first number found in the data as amount.

    Only done when the field 'name' or 'product_id' is selected."""
    selected = {field.name for field in self.model_field_ids}
    if not selected & {'name', 'product_id'}:
        raise ValidationError(_("Can't create an expense without "
                                "description or category"))
    products = self.env['product.product']
    product = products.search([('name', '=', 'BILL EXPENSE')], limit=1)
    if not product:
        product = products.create({'name': 'BILL EXPENSE'})
    found = re.search(r'\b\d+(?:\.\d{1,2})?\b', self.data or '')
    total_amount = float(found.group(0)) if found else 0.0
    expense_record = self.env['hr.expense'].create({
        'name': product.name,
        'product_id': product.id,
        'unit_amount': total_amount,
    })
    return self._form_action("Expense", 'hr.expense', expense_record.id,
                             'hr_expense.hr_expense_view_form')

def _process_sale_order(self):
    """Create a sale.order with its lines from the data."""
    return self._process_order(
        'sale.order', 'sale.order.line', 'product_uom_qty',
        "Sale order", 'sale.view_order_form')

def _process_purchase_order(self):
    """Create a purchase.order with its lines from the data."""
    # purchase.order.line keeps the ordered quantity in 'product_qty';
    # the value written to 'product_uom_qty' before did not set it.
    return self._process_order(
        'purchase.order', 'purchase.order.line', 'product_qty',
        "Purchase order", 'purchase.purchase_order_form')

def _process_order(self, order_model, line_model, quantity_field, title,
                   view_xmlid):
    """
    Create an order for the person named in the data, with one line per
    product returned by ``get_order_line``.

    :param order_model: model of the order to create
    :param line_model: model of the order lines
    :param quantity_field: line field receiving the quantity
    :param title: name of the returned window action
    :param view_xmlid: XML id of the form view to open
    :return: window action for the new order, or None when no field is
        selected at all
    :raises ValidationError: when 'order_line' is not among the selected
        fields, or no person name is found in the data
    """
    selected = {field.name for field in self.model_field_ids}
    if 'order_line' not in selected:
        if selected:
            raise ValidationError(_("No data to map into the field"))
        return None
    person = self.find_person_name()
    if not person:
        # Without a name there is no partner for the order; this used to
        # end in an AttributeError on ``False.id``.
        raise ValidationError(_("Partner name cannot find"))
    # 'partner_id' of an order refers to res.partner, so the partner is
    # searched and created there (it used to be an hr.employee, whose id
    # is not a partner id).
    partners = self.env['res.partner']
    partner = partners.search([('name', '=', person)], limit=1)
    if not partner:
        partner = partners.create({'name': person})
    # Calling the function to get order lines.
    product_lines = self.get_order_line(self.data)
    order = self.env[order_model].create({'partner_id': partner.id})
    products = self.env['product.product']
    for item in product_lines:
        raw_product = item['product']
        # get_order_line() returns the captured groups of its product
        # pattern as a tuple; the second group (the text in front of the
        # HSN/SAC line) is used as product name. A plain string is used
        # as it is.
        product_name = (raw_product if isinstance(raw_product, str)
                        else raw_product[1])
        product = products.search(
            [('name', '=', product_name)], limit=1)
        if not product:
            product = products.create({'name': product_name})
        self.env[line_model].create({
            'order_id': order.id,
            'product_id': product.id,
            # Missing quantity or price defaults to 0, as before.
            quantity_field: item.get('quantity', 0),
            'price_unit': item.get('price', 0),
        })
    return self._form_action(title, order_model, order.id, view_xmlid)
|
|
|
|
@api.onchange('image')
def _onchange_image(self):
    """ Copy the uploaded document into ``image2`` whenever ``image``
    changes. """
    # Inside an onchange the value is assigned on the record instead of
    # calling write().
    self.image2 = self.image
|
|
|