Thiết kế website giá rẻ

Question

I am working on a Django project where I need to extract text from images using Tesseract OCR. The images are often blurred and low in contrast, and contain various text blocks such as headings, dates, small letters, bold letters, and sometimes even blurred letters. The image dimensions are approximately 256 x 350 pixels with a resolution of 96 DPI.

import os
from django.shortcuts import render
from django.core.files.storage import FileSystemStorage
import pytesseract
import cv2
from PIL import Image
import numpy as np
import re

# Tell pytesseract which Tesseract executable to invoke. The location is
# hard-coded here to /usr/bin/tesseract.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

def clean_text(text):
    """Normalise raw OCR output for display.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, then any character that is not a word character,
    whitespace, or one of ``, . ! ? ; : ( ) " '`` is removed.

    Args:
        text: The string returned by the OCR engine.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    # The backslashes matter: r's+' would match runs of the letter "s", and
    # the character class needs \w / \s plus an escaped quote to be valid.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s,.!?;:()"\']', '', text)
    return text.strip()

def home(request):
    """Serve the landing page that hosts the document upload form."""
    template_name = 'layout/index.html'
    return render(request, template_name)

def preprocess_image(image_path):
    """Load an image from disk and binarise it for OCR.

    Pipeline: grayscale conversion, min-max contrast stretching, unsharp
    masking, adaptive Gaussian thresholding, then a 3x3 median filter.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The single-channel binary image (pixel values 0 or 255) produced by
        the final median filter.

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Error loading image: {image_path}")

    # Standard weighted BGR -> gray conversion (all three channels are used).
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Stretch the histogram so the darkest pixel maps to 0 and the brightest
    # to 255. A perfectly uniform image has max == min; stretching it would
    # divide by zero, so it is passed through unchanged instead.
    min_val = np.min(gray)
    max_val = np.max(gray)
    if max_val > min_val:
        stretched_gray = ((gray - min_val) / (max_val - min_val) * 255).astype(np.uint8)
    else:
        stretched_gray = gray

    # Unsharp masking: 1.5 * image - 0.5 * blurred copy boosts edge contrast.
    blurred = cv2.GaussianBlur(stretched_gray, (9, 9), 10.0)
    unsharp_image = cv2.addWeighted(stretched_gray, 1.5, blurred, -0.5, 0)

    # Threshold each pixel against a Gaussian-weighted 11x11 neighbourhood
    # mean minus 2, so uneven lighting does not wipe out text.
    binary = cv2.adaptiveThreshold(unsharp_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

    # A 3x3 median filter removes isolated speckles left by thresholding.
    denoised = cv2.medianBlur(binary, 3)

    return denoised

def upload(request):
    """Store an uploaded document, run OCR on it and render the result.

    On a POST carrying a ``document`` file, the file is saved through
    ``FileSystemStorage``, preprocessed, passed to Tesseract, and the cleaned
    text is rendered together with the stored file's URL. Any other request
    just renders the empty upload page.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered ``layout/index.html`` response. Its context holds
        ``uploaded_file_url`` and ``text`` on success, or ``error`` when the
        upload cannot be read as an image.
    """
    if request.method == 'POST' and request.FILES.get('document'):
        document = request.FILES['document']
        fs = FileSystemStorage()
        filename = fs.save(document.name, document)
        uploaded_file_url = fs.url(filename)

        # Preprocess the uploaded document
        try:
            preprocessed_image = preprocess_image(fs.path(filename))
        except ValueError as e:
            # The upload is not a readable image; remove it so rejected
            # files do not pile up in storage.
            fs.delete(filename)
            return render(request, 'layout/index.html', {'error': str(e)})

        # Convert the processed image to PIL format
        preprocessed_image_pil = Image.fromarray(preprocessed_image)

        # Extract text using Tesseract (--psm 12: sparse text with OSD).
        # NOTE(review): --dpi declares the input resolution to Tesseract;
        # 1500 looks far higher than typical scans - confirm the intended value.
        custom_config = r'--oem 3 --psm 12 --dpi 1500'
        text = pytesseract.image_to_string(preprocessed_image_pil, config=custom_config)

        # Clean the extracted text
        text = clean_text(text)

        context = {
            'uploaded_file_url': uploaded_file_url,
            'text': text,
        }
        return render(request, 'layout/index.html', context)
    return render(request, 'layout/index.html')

import os
from django.shortcuts import render
from django.core.files.storage import FileSystemStorage
import pytesseract
import cv2
from PIL import Image
import numpy as np
import re

# Tell pytesseract which Tesseract executable to invoke. The location is
# hard-coded here to /usr/bin/tesseract.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

def clean_text(text):
    """Normalise raw OCR output for display.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, then any character that is not a word character,
    whitespace, or one of ``, . ! ? ; : ( ) " '`` is removed.

    Args:
        text: The string returned by the OCR engine.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    # The backslashes matter: r's+' would match runs of the letter "s", and
    # the character class needs \w / \s plus an escaped quote to be valid.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s,.!?;:()"\']', '', text)
    return text.strip()

def home(request):
    """Serve the landing page that hosts the document upload form."""
    template_name = 'layout/index.html'
    return render(request, template_name)

def preprocess_image(image_path):
    """Load an image from disk and binarise it for OCR.

    Pipeline: grayscale conversion, min-max contrast stretching, unsharp
    masking, adaptive Gaussian thresholding, then a 3x3 median filter.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The single-channel binary image (pixel values 0 or 255) produced by
        the final median filter.

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Error loading image: {image_path}")

    # Standard weighted BGR -> gray conversion (all three channels are used).
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Stretch the histogram so the darkest pixel maps to 0 and the brightest
    # to 255. A perfectly uniform image has max == min; stretching it would
    # divide by zero, so it is passed through unchanged instead.
    min_val = np.min(gray)
    max_val = np.max(gray)
    if max_val > min_val:
        stretched_gray = ((gray - min_val) / (max_val - min_val) * 255).astype(np.uint8)
    else:
        stretched_gray = gray

    # Unsharp masking: 1.5 * image - 0.5 * blurred copy boosts edge contrast.
    blurred = cv2.GaussianBlur(stretched_gray, (9, 9), 10.0)
    unsharp_image = cv2.addWeighted(stretched_gray, 1.5, blurred, -0.5, 0)

    # Threshold each pixel against a Gaussian-weighted 11x11 neighbourhood
    # mean minus 2, so uneven lighting does not wipe out text.
    binary = cv2.adaptiveThreshold(unsharp_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

    # A 3x3 median filter removes isolated speckles left by thresholding.
    denoised = cv2.medianBlur(binary, 3)

    return denoised

def upload(request):
    """Store an uploaded document, run OCR on it and render the result.

    On a POST carrying a ``document`` file, the file is saved through
    ``FileSystemStorage``, preprocessed, passed to Tesseract, and the cleaned
    text is rendered together with the stored file's URL. Any other request
    just renders the empty upload page.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered ``layout/index.html`` response. Its context holds
        ``uploaded_file_url`` and ``text`` on success, or ``error`` when the
        upload cannot be read as an image.
    """
    if request.method == 'POST' and request.FILES.get('document'):
        document = request.FILES['document']
        fs = FileSystemStorage()
        filename = fs.save(document.name, document)
        uploaded_file_url = fs.url(filename)

        # Preprocess the uploaded document
        try:
            preprocessed_image = preprocess_image(fs.path(filename))
        except ValueError as e:
            # The upload is not a readable image; remove it so rejected
            # files do not pile up in storage.
            fs.delete(filename)
            return render(request, 'layout/index.html', {'error': str(e)})

        # Convert the processed image to PIL format
        preprocessed_image_pil = Image.fromarray(preprocessed_image)

        # Extract text using Tesseract (--psm 12: sparse text with OSD).
        # NOTE(review): --dpi declares the input resolution to Tesseract;
        # 1500 looks far higher than typical scans - confirm the intended value.
        custom_config = r'--oem 3 --psm 12 --dpi 1500'
        text = pytesseract.image_to_string(preprocessed_image_pil, config=custom_config)

        # Clean the extracted text
        text = clean_text(text)

        context = {
            'uploaded_file_url': uploaded_file_url,
            'text': text,
        }
        return render(request, 'layout/index.html', context)
    return render(request, 'layout/index.html')

import os
from django.shortcuts import render
from django.core.files.storage import FileSystemStorage
import pytesseract
import cv2
from PIL import Image
import numpy as np
import re

# Tell pytesseract which Tesseract executable to invoke. The location is
# hard-coded here to /usr/bin/tesseract.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

def clean_text(text):
    """Normalise raw OCR output for display.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, then any character that is not a word character,
    whitespace, or one of ``, . ! ? ; : ( ) " '`` is removed.

    Args:
        text: The string returned by the OCR engine.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    # The backslashes matter: r's+' would match runs of the letter "s", and
    # the character class needs \w / \s plus an escaped quote to be valid.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s,.!?;:()"\']', '', text)
    return text.strip()

def home(request):
    """Serve the landing page that hosts the document upload form."""
    template_name = 'layout/index.html'
    return render(request, template_name)

def preprocess_image(image_path):
    """Load an image from disk and binarise it for OCR.

    Pipeline: grayscale conversion, min-max contrast stretching, unsharp
    masking, adaptive Gaussian thresholding, then a 3x3 median filter.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The single-channel binary image (pixel values 0 or 255) produced by
        the final median filter.

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Error loading image: {image_path}")

    # Standard weighted BGR -> gray conversion (all three channels are used).
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Stretch the histogram so the darkest pixel maps to 0 and the brightest
    # to 255. A perfectly uniform image has max == min; stretching it would
    # divide by zero, so it is passed through unchanged instead.
    min_val = np.min(gray)
    max_val = np.max(gray)
    if max_val > min_val:
        stretched_gray = ((gray - min_val) / (max_val - min_val) * 255).astype(np.uint8)
    else:
        stretched_gray = gray

    # Unsharp masking: 1.5 * image - 0.5 * blurred copy boosts edge contrast.
    blurred = cv2.GaussianBlur(stretched_gray, (9, 9), 10.0)
    unsharp_image = cv2.addWeighted(stretched_gray, 1.5, blurred, -0.5, 0)

    # Threshold each pixel against a Gaussian-weighted 11x11 neighbourhood
    # mean minus 2, so uneven lighting does not wipe out text.
    binary = cv2.adaptiveThreshold(unsharp_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

    # A 3x3 median filter removes isolated speckles left by thresholding.
    denoised = cv2.medianBlur(binary, 3)

    return denoised

def upload(request):
    """Store an uploaded document, run OCR on it and render the result.

    On a POST carrying a ``document`` file, the file is saved through
    ``FileSystemStorage``, preprocessed, passed to Tesseract, and the cleaned
    text is rendered together with the stored file's URL. Any other request
    just renders the empty upload page.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered ``layout/index.html`` response. Its context holds
        ``uploaded_file_url`` and ``text`` on success, or ``error`` when the
        upload cannot be read as an image.
    """
    if request.method == 'POST' and request.FILES.get('document'):
        document = request.FILES['document']
        fs = FileSystemStorage()
        filename = fs.save(document.name, document)
        uploaded_file_url = fs.url(filename)

        # Preprocess the uploaded document
        try:
            preprocessed_image = preprocess_image(fs.path(filename))
        except ValueError as e:
            # The upload is not a readable image; remove it so rejected
            # files do not pile up in storage.
            fs.delete(filename)
            return render(request, 'layout/index.html', {'error': str(e)})

        # Convert the processed image to PIL format
        preprocessed_image_pil = Image.fromarray(preprocessed_image)

        # Extract text using Tesseract (--psm 12: sparse text with OSD).
        # NOTE(review): --dpi declares the input resolution to Tesseract;
        # 1500 looks far higher than typical scans - confirm the intended value.
        custom_config = r'--oem 3 --psm 12 --dpi 1500'
        text = pytesseract.image_to_string(preprocessed_image_pil, config=custom_config)

        # Clean the extracted text
        text = clean_text(text)

        context = {
            'uploaded_file_url': uploaded_file_url,
            'text': text,
        }
        return render(request, 'layout/index.html', context)
    return render(request, 'layout/index.html')


<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Document Layout Detection</title>
</head>
<body>
    <h1>Upload Document</h1>
    {# Posts the chosen file to the view registered under the URL name 'upload'. #}
    <form method="post" enctype="multipart/form-data" action="{% url 'upload' %}">
        {% csrf_token %}
        <input type="file" name="document">
        <button type="submit">Upload</button>
    </form>

    {# The upload view supplies 'error' when the file cannot be loaded as an image. #}
    {% if error %}
        <p class="error">{{ error }}</p>
    {% endif %}

    {# Shown only after a successful upload: the stored image and its OCR text. #}
    {% if uploaded_file_url %}
        <h2>Uploaded Document:</h2>
        <img src="{{ uploaded_file_url }}" alt="Document">
        <h2>Extracted Text:</h2>
        <pre>{{ text }}</pre>
    {% endif %}
</body>
</html>
# Django URL configuration
from django.urls import path
from . import views

# Route table: the empty path is handled by views.home and 'upload/' by
# views.upload. Each route is registered under the given name.
urlpatterns = [
    path('', views.home, name='home'),
    path('upload/', views.upload, name='upload'),
]

Thiết kế website giá rẻ

Danh mục

Issue with Extracting Text from Images Using Django and Tesseract OCR