You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

556 lines
24 KiB

# -*- coding: utf-8 -*-
###############################################################################
#
# Cybrosys Technologies Pvt. Ltd.
#
# Copyright (C) 2024-TODAY Cybrosys Technologies(<https://www.cybrosys.com>)
# Author: Sruthi Renjith (odoo@cybrosys.com)
#
# You can modify it under the terms of the GNU AFFERO
# GENERAL PUBLIC LICENSE (AGPL v3), Version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU AFFERO GENERAL PUBLIC LICENSE (AGPL v3) for more details.
#
# You should have received a copy of the GNU AFFERO GENERAL PUBLIC LICENSE
# (AGPL v3) along with this program.
# If not, see <http://www.gnu.org/licenses/>.
#
###############################################################################
import io
import logging
import os
import re

import pytesseract
import spacy
from pdf2image import convert_from_bytes
from PIL import Image, ImageOps

from odoo import api, fields, models, _
from odoo.exceptions import ValidationError


class OCRDataTemplate(models.TransientModel):
    """Wizard that reads an uploaded JPG, JPEG, PNG or PDF document, extracts
    its text with OCR and maps the recognised data onto a target model."""

    _name = "ocr.data.template"
    _description = "Data Retrieving Template"
    _rec_name = "file_name"

    image = fields.Binary(
        string="Document", required=True, help="Upload .jpg, .jpeg, .png or .pdf files"
    )
    file_name = fields.Char(string="Document Name", help="Name of document")
    image2 = fields.Image(string="Document", help="Uploaded document", store=True)
    flag = fields.Boolean(
        string="Flag", default=False, help="Flag to check document read or not"
    )
    data = fields.Text(string="Data", readonly=True, help="Content from the document")
    model_name_id = fields.Many2one(
        "ir.model",
        string="Model",
        domain="[('model', 'in', ['res.partner', 'account.move', "
        "'hr.employee', 'hr.expense', 'sale.order', "
        "'purchase.order'])]",
        help="Model to which the data want to map",
    )
    model_field_ids = fields.Many2many(
        "ir.model.fields",
        string="Fields",
        domain="[('model_id', '=', model_name_id)]",
        help="Fields names to map data",
    )

    # ------------------------------------------------------------------
    # Document reading
    # ------------------------------------------------------------------
    def data_segmentation(self, img):
        """
        Split an image into the segments that are later handed to the OCR.

        The image is converted to grayscale, binarised with a fixed threshold
        and inverted, so that dark content ends up as non-zero pixels. The
        bounding box of the non-zero area is then cropped out repeatedly.

        :param img: PIL image to segment
        :return: list of PIL images; the first entry is the bounding box of
            all detected content, the list is empty for a blank image
        """
        img = ImageOps.grayscale(img)
        threshold_value = 176
        img = img.point(lambda x: 255 if x > threshold_value else 0, "1")
        # After inversion the background is black (zero), which is what
        # getbbox() ignores when computing the content bounds.
        img_rgb = ImageOps.invert(img.convert("RGB"))
        segments = []
        segment_bounds = img_rgb.getbbox()
        while segment_bounds:
            segment = img_rgb.crop(segment_bounds)
            if segment.size[0] > 0 and segment.size[1] > 0:
                segments.append(segment)
            # NOTE(review): ImageOps.crop() appears to treat the bounds as
            # border widths to strip from each side rather than as a box, so
            # the segments after the first one look arbitrary. Logic kept
            # unchanged because only the first segment is consumed.
            img_rgb = ImageOps.crop(img_rgb, segment_bounds)
            segment_bounds = img_rgb.getbbox()
        return segments

    def _get_document_path(self):
        """Return the filestore path of an attachment linked to this wizard."""
        attachment = self.env["ir.attachment"].search(
            [
                # This always-true clause is kept from the original code;
                # presumably it stops ir.attachment from filtering out
                # field-backed attachments by default - verify against the
                # Odoo version in use.
                "|",
                ("res_field", "!=", False),
                ("res_field", "=", False),
                ("res_id", "=", self.id),
                ("res_model", "=", "ocr.data.template"),
            ],
            limit=1,
        )
        return attachment._full_path(attachment.store_fname)

    @staticmethod
    def _combine_pdf_pages(pages):
        """Stack the rendered PDF pages vertically into one RGB image.

        Every page is resized to 2400x1800 first. The canvas is sized from
        the resized pages so that nothing pasted onto it gets clipped.
        Raises ValueError when ``pages`` is empty.
        """
        resized_pages = [page.resize((2400, 1800)) for page in pages]
        canvas_width = max(page.width for page in resized_pages)
        canvas_height = sum(page.height for page in resized_pages)
        combined_image = Image.new("RGB", (canvas_width, canvas_height))
        y_offset = 0
        for page in resized_pages:
            combined_image.paste(page, (0, y_offset))
            y_offset += page.height
        return combined_image

    def action_get_data(self):
        """
        Read the uploaded .jpg, .jpeg, .png or .pdf document, run OCR on it
        and store the recognised text in ``data``.

        Files with any other extension produce empty text. ``flag`` is set
        once the document has been processed.

        :raises ValidationError: when the document cannot be loaded or the
            OCR engine fails
        """
        # Compare the extension case-insensitively and tolerate a missing
        # file name, which is not a required field.
        extension = os.path.splitext(self.file_name or "")[1].lower()
        try:
            file_path = self._get_document_path()
            segmented_data = []
            if extension in (".jpg", ".jpeg", ".png"):
                with open(file_path, mode="rb") as document:
                    img = Image.open(io.BytesIO(document.read()))
                segmented_data = self.data_segmentation(img)
            elif extension == ".pdf":
                with open(file_path, mode="rb") as document:
                    pages = convert_from_bytes(document.read())
                # Multi-page documents are merged into one tall image
                segmented_data = self.data_segmentation(
                    self._combine_pdf_pages(pages)
                )
        except Exception as err:
            # NOTE(review): this removes the most recent wizard record, which
            # is not necessarily this one - confirm that is intended.
            self.env["ocr.data.template"].search([], order="id desc", limit=1).unlink()
            raise ValidationError(_("Cannot identify data")) from err
        # Converting the segmented image into text using pytesseract. Only
        # the first segment is read, exactly as before; it is the bounding
        # box of everything detected on the page.
        text = ""
        if segmented_data:
            try:
                text = pytesseract.image_to_string(segmented_data[0]) + "\n"
            except Exception as err:
                logging.getLogger(__name__).exception("Could not convert")
                raise ValidationError(_("Data cannot be read")) from err
        # Assigning retrieved data into text field
        self.data = text
        self.flag = True

    @api.onchange("model_name_id")
    def onchange_model_name_id(self):
        """Function to update the Many2many field to empty"""
        self.write({"model_field_ids": [(6, 0, [])]})

    # ------------------------------------------------------------------
    # Text analysis
    # ------------------------------------------------------------------
    def find_person_name(self):
        """
        Function to find person name from the retrieved text using 'spacy'

        :return: text of the first entity labelled PERSON, or an empty
            string when there is no text or no such entity
        """
        text = self.data or ""
        if not text:
            # Nothing to analyse; skip loading the language model.
            return ""
        nlp = spacy.load("en_core_web_sm")
        for entity in nlp(text).ents:
            if entity.label_ == "PERSON":
                return entity.text
        return ""

    def get_order_line(self, text):
        """
        Function to find product lines from retrieved data using regex

        :param text: OCR text of the order document
        :return: list of dicts, one per detected product. ``"product"`` is
            the tuple of regex groups ``(reference, name, HSN/SAC code)``;
            ``"quantity"`` and ``"price"`` (numeric strings) are present only
            when exactly as many of them as products were found.
        """
        text = text or ""
        # The bracketed internal reference is optional. It is matched as one
        # unit so that the product name group holds the full name.
        product_regex = r"(?:\[(.+?)\]\s*)?(.+)\n(?:HSN/SAC Code):\s+(\d+)"
        quantity_regex = r"Quantity Unit\n([\d.\s\S]+)"
        unit_price_regex = r"Amount\n([\d.\s\S]+)"
        decimal_regex = r"\d+\.\d+"

        quantities = []
        unit_prices = []
        # Matching the pattern with the data, then collecting the decimal
        # numbers found in the matched part
        quantity_match = re.search(quantity_regex, text)
        if quantity_match:
            quantities = re.findall(decimal_regex, quantity_match.group(1))
        price_match = re.search(unit_price_regex, text)
        if price_match:
            unit_prices = re.findall(decimal_regex, price_match.group(1))
        # Finding the data that matches the pattern for products
        products = re.findall(product_regex, text)

        # A column is only trusted when it has one value per product
        has_quantities = len(quantities) == len(products)
        has_prices = len(unit_prices) == len(products)
        product_line_list = []
        for index, product in enumerate(products):
            line = {"product": product}
            if has_quantities:
                line["quantity"] = quantities[index]
            if has_prices:
                line["price"] = unit_prices[index]
            product_line_list.append(line)
        return product_line_list

    # ------------------------------------------------------------------
    # Mapping helpers
    # ------------------------------------------------------------------
    def _form_action(self, title, res_model, record, view_xmlid):
        """Build the window action that opens ``record`` in its form view."""
        return {
            "name": title,
            "type": "ir.actions.act_window",
            "view_type": "form",
            "view_mode": "form",
            "res_model": res_model,
            "res_id": record.id,
            "view_id": self.env.ref(view_xmlid).id,
            "target": "current",
        }

    def _scan_contact_fields(
        self, name_fields, phone_fields, email_fields, missing_name_message
    ):
        """Extract person name, phone and email for the selected fields.

        :param name_fields: field names that trigger the person lookup
        :param phone_fields: field names that trigger the phone lookup
        :param email_fields: field names that trigger the email lookup
        :param missing_name_message: translated error used when a name field
            is selected but no person is found in the text
        :return: tuple ``(person, phone_number, email_address)``; parts that
            were not requested or not found are empty strings
        :raises ValidationError: when no person is found, or when only
            unsupported fields were selected
        """
        phone_pattern = r"\(\d{3}\) \d{3}-\d{4}|\d{3}-\d{3}-\d{4}|\+\d{1}-\d{3}-\d{3}-\d{4}|\d{11}|P \+\d{3} \d{6}"
        email_pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
        text = self.data or ""
        person = ""
        phone_number = ""
        email_address = ""
        unsupported_selected = False
        for field in self.model_field_ids:
            if field.name in name_fields:
                # The lookup loads a language model, so it runs only once
                # even when several name-like fields are selected.
                if not person:
                    person = self.find_person_name()
                if not person:
                    raise ValidationError(missing_name_message)
            elif field.name in phone_fields:
                phone_match = re.search(phone_pattern, text)
                if phone_match:
                    phone_number = phone_match.group(0)
            elif field.name in email_fields:
                email_match = re.search(email_pattern, text)
                if email_match:
                    email_address = email_match.group(0)
            else:
                unsupported_selected = True
        if not person and unsupported_selected:
            raise ValidationError(_("No data to map into the field"))
        return person, phone_number, email_address

    def _get_or_create_product(self, product_match):
        """Return the product named by an order line match, creating it if
        it does not exist yet.

        ``product_match`` is the ``"product"`` value produced by
        ``get_order_line``: a ``(reference, name, HSN/SAC code)`` tuple. A
        plain string is accepted as well and used as the name directly.
        """
        if isinstance(product_match, str):
            product_name = product_match
        else:
            product_name = product_match[1]
        product_name = product_name.strip()
        product_model = self.env["product.product"]
        product = product_model.search([("name", "=", product_name)], limit=1)
        if not product:
            product = product_model.create({"name": product_name})
        return product

    def _process_contact(self):
        """Create a contact from the person name, phone and email found in
        the text and return the action opening it."""
        person, phone_number, email_address = self._scan_contact_fields(
            name_fields=("name", "display_name"),
            phone_fields=("phone",),
            email_fields=("email",),
            missing_name_message=_("Partner name cannot find"),
        )
        if not person:
            raise ValidationError(_("Name field is not chosen to create partner"))
        partner_model = self.env["res.partner"]
        if partner_model.search([("name", "=", person)], limit=1):
            raise ValidationError(_("Partner already exist"))
        # Creating record in res.partner
        partner_record = partner_model.create(
            {"name": person, "email": email_address, "phone": phone_number}
        )
        return self._form_action(
            "Partner", "res.partner", partner_record, "base.view_partner_form"
        )

    def _process_journal_entry(self):
        """Create a vendor bill from the document through ``digitize.bill``
        and return the action opening it."""
        selected_fields = self.model_field_ids.mapped("name")
        if "invoice_vendor_bill_id" not in selected_fields:
            raise ValidationError(_("No data to map into the field"))
        try:
            with open(self._get_document_path(), mode="rb") as document:
                img = Image.open(io.BytesIO(document.read()))
            # Resizing the image to improve the clarity
            resized_img = img.resize(
                (img.width * 2, img.height * 2), resample=Image.BICUBIC
            )
            # Converting the image into text using OCR python package
            # pytesseract
            text = pytesseract.image_to_string(resized_img)
        except Exception as err:
            raise ValidationError(_("Can't create vendor bill")) from err
        # Calling the function to create vendor bill from model digitize.bill
        bill_record = self.env["digitize.bill"].create_record(text)
        return self._form_action(
            "Bill", "account.move", bill_record, "account.view_move_form"
        )

    def _process_employee(self):
        """Create an employee from the person name, phone and email found
        in the text and return the action opening it."""
        person, phone_number, email_address = self._scan_contact_fields(
            name_fields=("name", "display_name", "emergency_contact"),
            phone_fields=("work_phone", "phone", "emergency_phone"),
            email_fields=("private_email", "work_email"),
            missing_name_message=_("Employee name cannot find"),
        )
        if not person:
            raise ValidationError(_("Name field is not chosen to create employee"))
        employee_model = self.env["hr.employee"]
        if employee_model.search([("name", "=", person)], limit=1):
            raise ValidationError(_("Employee already exist"))
        # Creating a record in hr.employee by mapping the data into
        # employee name, work phone and work email
        employee_record = employee_model.create(
            {
                "name": person,
                "work_email": email_address,
                "work_phone": phone_number,
            }
        )
        return self._form_action(
            "Employee", "hr.employee", employee_record, "hr.view_employee_form"
        )

    def _process_expense(self):
        """Create an expense on the generic 'BILL EXPENSE' product and
        return the action opening it."""
        selected_fields = self.model_field_ids.mapped("name")
        if "name" not in selected_fields and "product_id" not in selected_fields:
            raise ValidationError(
                _("Can't create an expense without description or category")
            )
        product_model = self.env["product.product"]
        product = product_model.search([("name", "=", "BILL EXPENSE")], limit=1)
        if not product:
            product = product_model.create({"name": "BILL EXPENSE"})
        expense_record = self.env["hr.expense"].create({"product_id": product.id})
        return self._form_action(
            "Expense", "hr.expense", expense_record, "hr_expense.hr_expense_view_form"
        )

    def _process_order(self, order_model, line_model, quantity_field, title, view_xmlid):
        """Create a sale or purchase order with its lines from the text.

        :param order_model: model of the order to create
        :param line_model: model of the order lines
        :param quantity_field: name of the quantity field on ``line_model``
        :param title: name of the returned window action
        :param view_xmlid: XML id of the order form view
        :return: action opening the new order, or None when no field is
            selected at all
        :raises ValidationError: when 'order_line' is not among the selected
            fields, or when no partner name is found in the text
        """
        selected_fields = self.model_field_ids.mapped("name")
        if "order_line" not in selected_fields:
            if selected_fields:
                raise ValidationError(_("No data to map into the field"))
            return None
        person = self.find_person_name()
        if not person:
            # An order cannot be created without a partner.
            raise ValidationError(_("Partner name cannot find"))
        # partner_id on orders refers to res.partner
        partner_model = self.env["res.partner"]
        partner = partner_model.search([("name", "=", person)], limit=1)
        if not partner:
            partner = partner_model.create({"name": person})
        # Calling the function to get order lines
        product_lines = self.get_order_line(self.data)
        order = self.env[order_model].create({"partner_id": partner.id})
        for item in product_lines:
            product = self._get_or_create_product(item["product"])
            self.env[line_model].create(
                {
                    "order_id": order.id,
                    "product_id": product.id,
                    # Missing quantity or price defaults to 0
                    quantity_field: item.get("quantity", 0),
                    "price_unit": item.get("price", 0),
                }
            )
        return self._form_action(title, order_model, order, view_xmlid)

    def _process_sale_order(self):
        """Map a properly formatted document into a sale order."""
        return self._process_order(
            "sale.order",
            "sale.order.line",
            "product_uom_qty",
            "Sale order",
            "sale.view_order_form",
        )

    def _process_purchase_order(self):
        """Map a properly formatted document into a purchase order."""
        return self._process_order(
            "purchase.order",
            "purchase.order.line",
            # The editable quantity of a purchase line is product_qty
            "product_qty",
            "Purchase order",
            "purchase.purchase_order_form",
        )

    def action_process_data(self):
        """
        Function to process the data after fetching it.
        The fetched data are mapping into some models.

        :return: window action opening the created record, or None when the
            selected model is not handled
        :raises ValidationError: when the data needed for the selected model
            and fields cannot be found
        """
        # Dispatch on the technical model name: the display name used before
        # is translatable and does not match in other languages.
        handlers = {
            "res.partner": self._process_contact,
            "account.move": self._process_journal_entry,
            "hr.employee": self._process_employee,
            "hr.expense": self._process_expense,
            "sale.order": self._process_sale_order,
            "purchase.order": self._process_purchase_order,
        }
        handler = handlers.get(self.model_name_id.model)
        if handler is None:
            return None
        return handler()

    @api.onchange("image")
    def _onchange_image(self):
        """Copy the uploaded document into the image field ``image2``."""
        self.write({"image2": self.image})