Problem faced in: While extracting Job title from image text, i am facing problems.
Approaches used for Job Title: For Job title, I have tried the following approaches:
- fuzzy matching only
- Fuzzy matching with TF-IDF vectorizer and cosine similarity.
Problem faced in approaches:
For job title:
- While using fuzzy matching only the designations detected are
partially correct. For eg: If the job title is “General manager”,
then the job title detected is “Manager”. - for second approach of
job title, the problem faced was results were NA for most of the
cases. For company Name: - While using
xlm-roberta-large-finetuned-conll03-english LLM (from hugging face)
and displaying the result with the maximum probability score, for
some cases, location names were returned if company name is not
detected. Company Name was also not detected if it didn’t adhere to
conventional norms. For eg: in case of “Dy. Commissioner of Income
Tax”, Income Tax is the name of the company. But the name was not
detected. - Comparing predictions of spacy and xlm-roberta on the
basis of max length: No output was shown by spacy and only output of
xlm-roberta was returned.
Due to unavoidable reasons, i wouldn’t be able to share code. Sharing functions for job title. Please suggest any possible improvements or additional logic that can be incorporated in the code so that it works. I am currently stuck at this point and any insight would be really helpful.
job title – 1. fuzzy matching only
import pandas as pd
from fuzzywuzzy import fuzz, process
def job_title(text_extracted_from_img):
designation = re.findall(r"^[\w'-,.][^0-9_!¡?÷?¿/\+=@#$%ˆ&*(){}|~<>;:[]]{2,}", text_extracted_from_img, flags=re.IGNORECASE)
designation = "".join(designation)
# 2019 job title dataset was downloaded from online source, all title related columns were named
df = pd.read_csv("./pdl_related_title_dataset/2019_free_title_data_copy.csv")
cols = ["title", "related titles 9", "related titles 10","related titles 11","related titles 12","related titles 13","related titles 14","related titles 15","related titles 16", "related titles 17", "related titles 18"]
df = pd.DataFrame(df, columns=cols)
def find_best_match(x):
best_match = None
best_score = 0
for column in df.columns:
match = process.extractOne(x, df[column].unique(), scorer=fuzz.token_set_ratio, score_cutoff=90)
if match and match[1] > best_score:
best_match = match[0]
best_score = match[1]
return best_match
designation = find_best_match(text_extracted_from_img)
print("Job Title:")
if designation:
print(designation)
else:
print("")
return designation
job title – 1. fuzzy matching with TF-IDF vectorizer and cosine similarity
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz, process
import re
import itertools
from typing import Union, List, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import time
import os
from io import StringIO
def preprocess(s):
s = s.replace(r'[.,"'-?:!;]', ' ', regex=True)
return s
def findFuzz(index, row, match, indices):
rowm = process.extract(
row, match,
scorer = fuzz.token_sort_ratio,
limit = 1
)
result = [(ABC[index], CLEAN[indices[index][0]], rowm[0][1])]
return result
def fuzzytf(abc, clean):
ngram_range = (1, 3)
n_neighbors = 1
analyzer = 'char'
limit = 1
clean_data = preprocess(clean)
input_data = preprocess(abc)
vectorizer = TfidfVectorizer(analyzer = analyzer, ngram_range = ngram_range)
X = vectorizer.fit_transform(clean_data.values.astype('U'))
nbrs = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine').fit(X)
input_vec = vectorizer.transform(input_data)
distances, indices = nbrs.kneighbors(input_vec, n_neighbors = n_neighbors)
nearest_values = np.array(clean_data)[indices]
results = [findFuzz(i, row, nearest_values[i], indices) for i, row in enumerate(input_data)]
df1 = pd.DataFrame(itertools.chain.from_iterable(results),
columns = ['Input', 'Output', 'Ratio'])
return df1
def job_title(text_extracted_from_img):
# 2019 job title dataset was downloaded from online source
data = pd.read_csv('./pdl_related_title_dataset/2019_free_title_data.csv', usecols = ['title'])
data = pd.DataFrame(data)
df = data.to_string()
master = data.filter(['title'], axis = 1)
global CLEAN, ABC
CLEAN = master['title']
# Write the extracted text to a temporary text file
with open("temp_file.txt", "w") as text_file:
text_file.write(text_extracted_from_img)
# Read the content from the temporary text file into a DataFrame
content = pd.read_csv("temp_file.txt", sep='delimiter', header=None, engine='python')
ABC = content.squeeze()
start = time.time()
end = time.time()
des= fuzzytf(ABC, CLEAN)
result.loc[(result["Ratio"] > 95),"found"]=True
result.loc[(result["Ratio"] < 95),"found"]=False
final_result = (result.loc[result["found"]==True])
if(final_result['Input'].empty):
final_result['Input'] = "NA" # Fixing the assignment operator here
des= "NA"
return des
else:
pos = final_result["Input"]
des= (pos.to_string(index = False))
return des
muskan rath is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.