I am working on a Django project where I need to extract text from images using Tesseract OCR. The images are often blurred and low in contrast, and contain various text blocks such as headings, dates, small letters, bold letters, and sometimes even blurred letters. The image dimensions are approximately 256 x 350 pixels with a resolution of 96 DPI.
import os
from django.shortcuts import render
from django.core.files.storage import FileSystemStorage
import pytesseract
import cv2
from PIL import Image
import numpy as np
import re
# Path to the Tesseract executable. Defaults to the usual Linux location;
# set the TESSERACT_CMD environment variable to point somewhere else.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', '/usr/bin/tesseract'
)
def clean_text(text):
    """Normalise raw OCR output into one tidy line of text.

    Characters other than word characters, whitespace and a small set of
    common punctuation marks are dropped, then every run of whitespace
    (newlines included) is collapsed to a single space.

    Args:
        text: Raw string returned by the OCR engine.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Triple-quoted raw string so the character class can contain both
    # quote characters without terminating the literal.
    text = re.sub(r'''[^\w\s,.!?;:()"']''', '', text)
    # Collapse whitespace last so gaps left by removed characters merge too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def home(request):
    """Render the 'layout/index.html' template with no extra context."""
    template_name = 'layout/index.html'
    return render(request, template_name)
def preprocess_image(image_path):
    """Load an image and convert it to a denoised binary image for OCR.

    Pipeline: grayscale -> min/max contrast stretch -> unsharp mask ->
    adaptive Gaussian threshold -> 3x3 median filter.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The thresholded, median-filtered single-channel image as a NumPy
        array.

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Error loading image: {image_path}")

    # Standard weighted BGR -> grayscale conversion.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Stretch the histogram so the darkest pixel maps to 0 and the
    # brightest to 255.
    min_val = np.min(gray)
    max_val = np.max(gray)
    if max_val > min_val:
        stretched_gray = (
            (gray - min_val) / (max_val - min_val) * 255
        ).astype(np.uint8)
    else:
        # A uniform image has no range to stretch; dividing by zero here
        # would fill the array with NaN before the uint8 cast.
        stretched_gray = gray

    # Unsharp mask: boost the image and subtract a heavily blurred copy
    # to emphasise edges of soft text.
    blurred = cv2.GaussianBlur(stretched_gray, (9, 9), 10.0)
    unsharp_image = cv2.addWeighted(stretched_gray, 1.5, blurred, -0.5, 0)

    # Local (11x11 neighbourhood) thresholding copes with uneven lighting
    # better than a single global threshold.
    binary = cv2.adaptiveThreshold(
        unsharp_image,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

    # Median filter removes isolated speckles left by thresholding.
    denoised = cv2.medianBlur(binary, 3)
    return denoised
def upload(request):
    """Store an uploaded document, run OCR on it and render the result.

    Non-POST requests, or POSTs without a ``document`` file, simply render
    the page. Otherwise the file is saved through ``FileSystemStorage``,
    preprocessed, passed to Tesseract, and the cleaned text is rendered
    together with the stored file's URL.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered 'layout/index.html' response. Its context holds
        ``uploaded_file_url`` and ``text`` on success, or ``error`` when
        preprocessing or OCR fails.
    """
    if request.method != 'POST' or not request.FILES.get('document'):
        return render(request, 'layout/index.html')

    document = request.FILES['document']
    fs = FileSystemStorage()
    filename = fs.save(document.name, document)
    uploaded_file_url = fs.url(filename)

    try:
        preprocessed_image = preprocess_image(fs.path(filename))
    except (ValueError, cv2.error) as e:
        # The upload could not be processed as an image; do not leave the
        # file behind in storage.
        fs.delete(filename)
        return render(request, 'layout/index.html', {'error': str(e)})

    # pytesseract accepts PIL images, so wrap the NumPy array.
    preprocessed_image_pil = Image.fromarray(preprocessed_image)

    # NOTE(review): --dpi 1500 does not match the ~96 DPI described for the
    # input images; confirm the intended value.
    custom_config = r'--oem 3 --psm 12 --dpi 1500'
    try:
        text = pytesseract.image_to_string(
            preprocessed_image_pil, config=custom_config
        )
    except pytesseract.TesseractError as e:
        # Report OCR engine failures on the page instead of a 500.
        return render(request, 'layout/index.html', {'error': str(e)})

    context = {
        'uploaded_file_url': uploaded_file_url,
        'text': clean_text(text),
    }
    return render(request, 'layout/index.html', context)
import os
from django.shortcuts import render
from django.core.files.storage import FileSystemStorage
import pytesseract
import cv2
from PIL import Image
import numpy as np
import re
# Path to the Tesseract executable. Defaults to the usual Linux location;
# set the TESSERACT_CMD environment variable to point somewhere else.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', '/usr/bin/tesseract'
)
def clean_text(text):
    """Normalise raw OCR output into one tidy line of text.

    Characters other than word characters, whitespace and a small set of
    common punctuation marks are dropped, then every run of whitespace
    (newlines included) is collapsed to a single space.

    Args:
        text: Raw string returned by the OCR engine.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Triple-quoted raw string so the character class can contain both
    # quote characters without terminating the literal.
    text = re.sub(r'''[^\w\s,.!?;:()"']''', '', text)
    # Collapse whitespace last so gaps left by removed characters merge too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def home(request):
    """Render the 'layout/index.html' template with no extra context."""
    template_name = 'layout/index.html'
    return render(request, template_name)
def preprocess_image(image_path):
    """Load an image and convert it to a denoised binary image for OCR.

    Pipeline: grayscale -> min/max contrast stretch -> unsharp mask ->
    adaptive Gaussian threshold -> 3x3 median filter.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The thresholded, median-filtered single-channel image as a NumPy
        array.

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Error loading image: {image_path}")

    # Standard weighted BGR -> grayscale conversion.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Stretch the histogram so the darkest pixel maps to 0 and the
    # brightest to 255.
    min_val = np.min(gray)
    max_val = np.max(gray)
    if max_val > min_val:
        stretched_gray = (
            (gray - min_val) / (max_val - min_val) * 255
        ).astype(np.uint8)
    else:
        # A uniform image has no range to stretch; dividing by zero here
        # would fill the array with NaN before the uint8 cast.
        stretched_gray = gray

    # Unsharp mask: boost the image and subtract a heavily blurred copy
    # to emphasise edges of soft text.
    blurred = cv2.GaussianBlur(stretched_gray, (9, 9), 10.0)
    unsharp_image = cv2.addWeighted(stretched_gray, 1.5, blurred, -0.5, 0)

    # Local (11x11 neighbourhood) thresholding copes with uneven lighting
    # better than a single global threshold.
    binary = cv2.adaptiveThreshold(
        unsharp_image,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

    # Median filter removes isolated speckles left by thresholding.
    denoised = cv2.medianBlur(binary, 3)
    return denoised
def upload(request):
    """Store an uploaded document, run OCR on it and render the result.

    Non-POST requests, or POSTs without a ``document`` file, simply render
    the page. Otherwise the file is saved through ``FileSystemStorage``,
    preprocessed, passed to Tesseract, and the cleaned text is rendered
    together with the stored file's URL.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered 'layout/index.html' response. Its context holds
        ``uploaded_file_url`` and ``text`` on success, or ``error`` when
        preprocessing or OCR fails.
    """
    if request.method != 'POST' or not request.FILES.get('document'):
        return render(request, 'layout/index.html')

    document = request.FILES['document']
    fs = FileSystemStorage()
    filename = fs.save(document.name, document)
    uploaded_file_url = fs.url(filename)

    try:
        preprocessed_image = preprocess_image(fs.path(filename))
    except (ValueError, cv2.error) as e:
        # The upload could not be processed as an image; do not leave the
        # file behind in storage.
        fs.delete(filename)
        return render(request, 'layout/index.html', {'error': str(e)})

    # pytesseract accepts PIL images, so wrap the NumPy array.
    preprocessed_image_pil = Image.fromarray(preprocessed_image)

    # NOTE(review): --dpi 1500 does not match the ~96 DPI described for the
    # input images; confirm the intended value.
    custom_config = r'--oem 3 --psm 12 --dpi 1500'
    try:
        text = pytesseract.image_to_string(
            preprocessed_image_pil, config=custom_config
        )
    except pytesseract.TesseractError as e:
        # Report OCR engine failures on the page instead of a 500.
        return render(request, 'layout/index.html', {'error': str(e)})

    context = {
        'uploaded_file_url': uploaded_file_url,
        'text': clean_text(text),
    }
    return render(request, 'layout/index.html', context)
import os
from django.shortcuts import render
from django.core.files.storage import FileSystemStorage
import pytesseract
import cv2
from PIL import Image
import numpy as np
import re
# Path to the Tesseract executable. Defaults to the usual Linux location;
# set the TESSERACT_CMD environment variable to point somewhere else.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', '/usr/bin/tesseract'
)
def clean_text(text):
    """Normalise raw OCR output into one tidy line of text.

    Characters other than word characters, whitespace and a small set of
    common punctuation marks are dropped, then every run of whitespace
    (newlines included) is collapsed to a single space.

    Args:
        text: Raw string returned by the OCR engine.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Triple-quoted raw string so the character class can contain both
    # quote characters without terminating the literal.
    text = re.sub(r'''[^\w\s,.!?;:()"']''', '', text)
    # Collapse whitespace last so gaps left by removed characters merge too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def home(request):
    """Render the 'layout/index.html' template with no extra context."""
    template_name = 'layout/index.html'
    return render(request, template_name)
def preprocess_image(image_path):
    """Load an image and convert it to a denoised binary image for OCR.

    Pipeline: grayscale -> min/max contrast stretch -> unsharp mask ->
    adaptive Gaussian threshold -> 3x3 median filter.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The thresholded, median-filtered single-channel image as a NumPy
        array.

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Error loading image: {image_path}")

    # Standard weighted BGR -> grayscale conversion.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Stretch the histogram so the darkest pixel maps to 0 and the
    # brightest to 255.
    min_val = np.min(gray)
    max_val = np.max(gray)
    if max_val > min_val:
        stretched_gray = (
            (gray - min_val) / (max_val - min_val) * 255
        ).astype(np.uint8)
    else:
        # A uniform image has no range to stretch; dividing by zero here
        # would fill the array with NaN before the uint8 cast.
        stretched_gray = gray

    # Unsharp mask: boost the image and subtract a heavily blurred copy
    # to emphasise edges of soft text.
    blurred = cv2.GaussianBlur(stretched_gray, (9, 9), 10.0)
    unsharp_image = cv2.addWeighted(stretched_gray, 1.5, blurred, -0.5, 0)

    # Local (11x11 neighbourhood) thresholding copes with uneven lighting
    # better than a single global threshold.
    binary = cv2.adaptiveThreshold(
        unsharp_image,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

    # Median filter removes isolated speckles left by thresholding.
    denoised = cv2.medianBlur(binary, 3)
    return denoised
def upload(request):
    """Store an uploaded document, run OCR on it and render the result.

    Non-POST requests, or POSTs without a ``document`` file, simply render
    the page. Otherwise the file is saved through ``FileSystemStorage``,
    preprocessed, passed to Tesseract, and the cleaned text is rendered
    together with the stored file's URL.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered 'layout/index.html' response. Its context holds
        ``uploaded_file_url`` and ``text`` on success, or ``error`` when
        preprocessing or OCR fails.
    """
    if request.method != 'POST' or not request.FILES.get('document'):
        return render(request, 'layout/index.html')

    document = request.FILES['document']
    fs = FileSystemStorage()
    filename = fs.save(document.name, document)
    uploaded_file_url = fs.url(filename)

    try:
        preprocessed_image = preprocess_image(fs.path(filename))
    except (ValueError, cv2.error) as e:
        # The upload could not be processed as an image; do not leave the
        # file behind in storage.
        fs.delete(filename)
        return render(request, 'layout/index.html', {'error': str(e)})

    # pytesseract accepts PIL images, so wrap the NumPy array.
    preprocessed_image_pil = Image.fromarray(preprocessed_image)

    # NOTE(review): --dpi 1500 does not match the ~96 DPI described for the
    # input images; confirm the intended value.
    custom_config = r'--oem 3 --psm 12 --dpi 1500'
    try:
        text = pytesseract.image_to_string(
            preprocessed_image_pil, config=custom_config
        )
    except pytesseract.TesseractError as e:
        # Report OCR engine failures on the page instead of a 500.
        return render(request, 'layout/index.html', {'error': str(e)})

    context = {
        'uploaded_file_url': uploaded_file_url,
        'text': clean_text(text),
    }
    return render(request, 'layout/index.html', context)
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Document Layout Detection</title>
</head>
<body>
    <h1>Upload Document</h1>
    <form method="post" enctype="multipart/form-data" action="{% url 'upload' %}">
        {% csrf_token %}
        <input type="file" name="document">
        <button type="submit">Upload</button>
    </form>
    {# The upload view passes "error" when the file could not be processed. #}
    {% if error %}
    <p role="alert">{{ error }}</p>
    {% endif %}
    {% if uploaded_file_url %}
    <h2>Uploaded Document:</h2>
    <img src="{{ uploaded_file_url }}" alt="Document">
    <h2>Extracted Text:</h2>
    <pre>{{ text }}</pre>
    {% endif %}
</body>
</html>
# Django URL configuration
from django.urls import path
from . import views
# Route table: the site root maps to views.home and 'upload/' to views.upload.
urlpatterns = [
    path('', views.home, name='home'),
    path('upload/', views.upload, name='upload'),
]
New contributor
Vishal Upadhyay is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.