I am developing a Telegram bot in Python and facing challenges with keyword recognition and extraction. Despite trying several approaches, including basic string matching, regular expressions, and NLP libraries (spaCy, NLTK), I cannot achieve consistent keyword recognition or acceptable performance on larger datasets.
I have implemented functions for normalizing text, fuzzy matching, and handling messages, but recognition is still inconsistent and performance drops significantly as the dataset grows. My goal is to improve recognition accuracy while keeping the bot responsive.
Expected: I expected the bot to consistently recognize keywords and respond with appropriate buttons.
Result: The recognition remains inconsistent, and the performance drops significantly as the dataset grows.
Questions:
- What are the best practices for keyword recognition in Telegram bots?
- Are there any optimized libraries or tools for this purpose?
- Could someone provide a working example or point to relevant resources?
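For reference, the phrase dictionaries in config.py look roughly like this (a simplified, illustrative excerpt; the real lists are much longer and also contain Russian and Armenian entries):

button_phrases = {
    'loan': ['loan', 'loan terms', 'take out a loan'],
    'calculators': ['loan calculator', 'deposit calculator'],
}
keywords = {
    'hipotek': ['mortgage', 'hipotek'],
    'grav': ['pledge', 'collateral'],
}
conjunctions = ['and', 'or', 'but', 'и', 'или']

The relevant bot code is below.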
import re # Importing the module for working with regular expressions
import difflib # Importing the module for sequence comparison
import spacy # Importing the spaCy library for natural language processing
import nltk # Importing the NLTK library for natural language processing
from fuzzywuzzy import fuzz # Importing the module for fuzzy string matching
from nltk.corpus import stopwords # Importing the list of stop words from NLTK
from nltk.stem import WordNetLemmatizer # Importing the lemmatizer from NLTK
from telebot import types # Importing the types module from the telebot library for creating buttons
import config # Importing the config file containing settings
import time # Importing the module for working with time
from button_functions import send_buttons # Importing the function for sending buttons
from buttons import (  # Importing functions for sending various buttons
    send_calculators_buttons,
    send_loan_buttons,
    send_sub_buttons_hipotek,
    send_sub_buttons_grav,
    send_sub_buttons_acra,
)
import logging # Importing the module for logging
from config import button_phrases # Importing button phrases from config
# Loading NLTK resources
nltk.download('punkt') # Downloading the tokenizer from NLTK
nltk.download('stopwords') # Downloading the stop words from NLTK
nltk.download('wordnet') # Downloading the WordNet lemmatizer from NLTK
# Loading spaCy models
nlp_en = spacy.load("en_core_web_sm") # Loading the model for English
nlp_ru = spacy.load("ru_core_news_sm") # Loading the model for Russian
# nlp_hy = spacy.load("hy_core_news_sm") # Loading the model for Armenian
# Initializing the lemmatizer and the list of stop words
lemmatizer = WordNetLemmatizer() # Initializing the lemmatizer
stop_words = set(stopwords.words('english')).union(set(stopwords.words('russian'))).union(
    set(config.conjunctions))  # Combining English and Russian stop words with the conjunctions from the config file
def normalize_text(text, lang='en'):  # Defining the function for normalizing text
    text = re.sub(r'[^\w\s]', '', text)  # Removing punctuation: every character that is not a word character or whitespace
    logging.info(f"Text after regex cleaning: {text}")  # Logging the text after removing characters
    # Removing conjunctions from the text
    words = text.split()  # Splitting the text into individual words
    words = [word for word in words if word not in config.conjunctions]  # Removing conjunctions using the list from the config file
    text = ' '.join(words)  # Joining the remaining words back into a text string
    logging.info(f"Text after removing conjunctions: {text}")  # Logging the text after removing conjunctions
    if lang == 'ru':  # If the language of the text is Russian
        doc = nlp_ru(text)  # Processing the text with the Russian language model
    elif lang == 'hy':  # If the language of the text is Armenian
        doc = text.split()  # Simple whitespace tokenization for Armenian (no spaCy model loaded)
    else:  # Default to English
        doc = nlp_en(text)  # Processing the text with the English language model
    if lang == 'hy':  # Armenian: tokens are plain strings
        normalized_tokens = [token.lower() for token in doc if not re.match(r'\W', token)]  # Lowercasing and dropping tokens that start with a non-word character
    else:  # English or Russian: tokens are spaCy tokens
        normalized_tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space]  # Lemmatizing and lowercasing, skipping punctuation and whitespace tokens
    normalized_text = " ".join(normalized_tokens)  # Joining the normalized tokens into a string
    logging.info(f"Normalized text: {normalized_text}")  # Logging the normalized text
    return normalized_text  # Returning the normalized text
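# Illustrative example of what the function returns (the exact output depends on the
# spaCy model version and on what config.conjunctions contains):
# normalize_text("What are the loan terms?") -> roughly "what be the loan term"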
def fuzzy_match(input_text, phrases):  # Defining the function for fuzzy matching
    best_match = None  # Initializing the variable for the best match
    highest_ratio = 0  # Initializing the variable for the highest ratio
    normalized_input = normalize_text(input_text)  # Normalizing the input text
    for phrase in phrases:  # For each phrase in the list of phrases
        normalized_phrase = normalize_text(phrase)  # Normalizing the phrase
        ratio = fuzz.ratio(normalized_input, normalized_phrase)  # Comparing the text and the phrase
        if ratio > highest_ratio:  # If the current ratio is higher than the highest ratio
            highest_ratio = ratio  # Updating the highest ratio
            best_match = phrase  # Updating the best phrase
    logging.info(f"Fuzzy match result: {best_match} with ratio: {highest_ratio}")  # Logging the result of the fuzzy match
    return best_match, highest_ratio  # Returning the best phrase and the highest ratio
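# A variant I am considering to cut down repeated work (a sketch only, assuming the phrase
# lists never change at runtime): memoize normalize_text so that the same phrases are not
# re-normalized on every incoming message.
from functools import lru_cache

@lru_cache(maxsize=None)
def normalize_text_cached(text, lang='en'):  # Memoized wrapper around normalize_text
    return normalize_text(text, lang)
# fuzzy_match and get_best_match could then call normalize_text_cached(phrase) instead of normalize_text(phrase).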
def get_best_match(input_text, phrases):  # Defining the function for finding the best match
    input_text = normalize_text(input_text)  # Normalizing the input text
    normalized_phrases = [normalize_text(phrase) for phrase in phrases]  # Normalizing the phrases
    matches = difflib.get_close_matches(input_text, normalized_phrases, n=1, cutoff=0.8)  # Finding the closest match
    if matches:  # If a match is found
        best_match_index = normalized_phrases.index(matches[0])  # Getting the index of the best match
        best_match = phrases[best_match_index]  # Getting the original (unnormalized) phrase
        ratio = difflib.SequenceMatcher(None, input_text, matches[0]).ratio() * 100  # Calculating the match percentage
        logging.info(f"Best match found: {best_match} with ratio: {ratio}")  # Logging the result of the match
        return best_match, ratio  # Returning the best phrase and the match percentage
    logging.info("No match found")  # Logging the absence of matches
    return None, 0  # Returning None and 0 if no matches are found
def find_best_match(input_text, config_section):  # Defining the function for finding the best match against a config section
    best_match, highest_ratio = fuzzy_match(input_text, config_section)  # Performing fuzzy matching
    if highest_ratio < 80:  # If the best fuzzy ratio is below 80
        best_match, highest_ratio = get_best_match(input_text, config_section)  # Falling back to the difflib closest-match search
    return best_match, highest_ratio  # Returning the best phrase and the highest ratio
def get_matching_keywords(message, threshold=81):  # Defining the function for getting matching keywords
    text = message.text.lower()  # Converting the message text to lowercase
    logging.info(f"get_matching_keywords called with text: {text}")  # Logging the function call with text
    best_match = None  # Initializing the variable for the best match
    highest_ratio = 0  # Initializing the variable for the highest ratio
    # Removing conjunctions
    text = ' '.join([word for word in text.split() if word not in config.conjunctions])  # Removing conjunctions from the text
    logging.info(f"Text after removing conjunctions: {text}")  # Logging the text after removing conjunctions
    # Checking for matches with button phrases
    for button, phrases in config.button_phrases.items():  # For each button and its phrases in the dictionary of button phrases
        for phrase in phrases:  # For each phrase in the list of phrases
            best_match_phrase, highest_ratio_phrase = fuzzy_match(text, [phrase.lower()])  # Performing fuzzy matching with the button phrase
            if highest_ratio_phrase > highest_ratio:  # If the current ratio is higher than the highest ratio
                best_match = button  # Updating the best button
                highest_ratio = highest_ratio_phrase  # Updating the highest ratio
    # Checking for matches with keywords
    for key, words in config.keywords.items():  # For each key and its words in the dictionary of keywords
        for word in words:  # For each word in the list of words
            best_match_word, highest_ratio_word = fuzzy_match(text, [word.lower()])  # Performing fuzzy matching with the keyword
            if highest_ratio_word > highest_ratio:  # If the current ratio is higher than the highest ratio
                best_match = key  # Updating the best key
                highest_ratio = highest_ratio_word  # Updating the highest ratio
    if highest_ratio >= threshold:  # If the highest ratio is greater than or equal to the threshold
        logging.info(f"Best matched keyword: {best_match} with ratio: {highest_ratio}")  # Logging the best matching keyword and the ratio
        return [best_match]  # Returning the list with the best keyword
    else:  # If the highest ratio is below the threshold
        logging.info(f"No exact match found, highest ratio: {highest_ratio}")  # Logging the absence of an exact match and the highest ratio
        return []  # Returning an empty list
def remove_fuzzy_matches(text, phrases_dict, threshold=74):  # Defining the function for removing fuzzy-matched phrases
    words = text.split()  # Splitting the text into words
    removed_phrases = []  # Initializing the list of removed phrase keys
    for key, phrases in phrases_dict.items():  # For each key and its phrases in the dictionary
        for phrase in phrases:  # For each phrase in the list of phrases
            phrase_words = phrase.split()  # Splitting the phrase into words
            phrase_len = len(phrase_words)  # Determining the length of the phrase
            indices_to_remove = set()  # Initializing the set of word indices to remove
            for i in range(len(words) - phrase_len + 1):  # Sliding a window of the phrase length over the words in the text
                window = words[i:i + phrase_len]  # Defining the window of words
                window_text = ' '.join(window)  # Joining the window words into a string
                if fuzz.ratio(window_text, phrase) >= threshold:  # If the match ratio is greater than or equal to the threshold
                    indices_to_remove.update(range(i, i + phrase_len))  # Marking the window indices for removal
                    removed_phrases.append(key)  # Adding the key to the list of removed phrases
                    logging.info(f'Removed phrase: {phrase} (matched key: {key})')  # Logging the removed phrase and key
                    break  # Stopping the window scan after the first match
            if indices_to_remove:  # If there are indices to remove
                words = [word for i, word in enumerate(words) if i not in indices_to_remove]  # Removing the matched words from the text
                break  # Moving on to the next key after a match
    cleaned_text = ' '.join(words)  # Joining the remaining words into a string
    logging.info(f"Cleaned text after removal: {cleaned_text}")  # Logging the cleaned text
    return removed_phrases, cleaned_text  # Returning the list of removed phrase keys and the cleaned text
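# Illustrative call with a hypothetical phrase dictionary:
# remove_fuzzy_matches("show me the loan calculator please", {"calculators": ["loan calculator"]})
# returns (["calculators"], "show me the please"): the window "loan calculator" scores 100
# against the phrase, so the key is recorded and the matched words are dropped from the text.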
def handle_message(bot, message):  # Defining the function for handling messages
    try:  # Starting the try block for exception handling
        if not hasattr(message, 'text') or not message.text:  # Checking whether the message has text
            bot.send_message(message.chat.id, "Message does not contain text", parse_mode='Markdown')  # Notifying the user that the message has no text
            return  # Stopping the function execution
        text = message.text.lower()  # Converting the message text to lowercase
        logging.info(f"Received message: {text}")  # Logging the received message
        # Removing conjunctions before processing the text
        text = ' '.join([word for word in text.split() if word not in config.conjunctions])  # Removing conjunctions from the text
        logging.info(f"Text after removing conjunctions: {text}")  # Logging the text after removing conjunctions
        btn_phr_removed = remove_fuzzy_matches(text, button_phrases)  # Removing matched button phrases from the text
        keywords = btn_phr_removed[0]  # Getting the list of matched button keys
        logging.info(f"Keywords after button phrase removal: {keywords}")  # Logging the keywords after removing button phrases
        new_text = btn_phr_removed[1]  # Getting the cleaned text
        keyword_matches = remove_fuzzy_matches(new_text, config.keywords)  # Removing matched keywords from the cleaned text
        keywords.extend(keyword_matches[0])  # Adding the matched keyword keys to the list
        logging.info(f'Final keywords: {keywords}')  # Logging the final keywords
        if keywords:  # If keywords are found
            send_buttons(bot, message, keywords)  # Sending the corresponding buttons
        else:  # If no keywords are found
            bot.send_message(message.chat.id, "No relevant materials were found for your request", parse_mode='Markdown')  # Informing the user that nothing was found
    except Exception as e:  # Catching any exception that may occur
        logging.error(f"Error handling message: {e}")  # Logging the error
        bot.send_message(message.chat.id, "An error has occurred", parse_mode='Markdown')  # Informing the user about the error
def open_site(bot, message):  # Defining the function for sending the site link
    bot.send_message(message.chat.id, 'Visit the ABC Finance website: <a href="https://www.abcfinance.am/">ABC Finance</a>', parse_mode='HTML')  # Sending a message with the site link
def open_loan_calculator(bot, message):  # Defining the function for sending the loan calculator link
    bot.send_message(message.chat.id, 'Use the Loan Calculator: <a href="https://www.abcfinance.am/calculators/loancalc.html">Loan Calculator</a>', parse_mode='HTML')  # Sending a message with the loan calculator link
def open_deposit_calculator(bot, message):  # Defining the function for sending the deposit calculator link
    bot.send_message(message.chat.id, 'Use the Deposit Calculator: <a href="https://www.abcfinance.am/calculators/depositcalc.html">Deposit Calculator</a>', parse_mode='HTML')  # Sending a message with the deposit calculator link
def open_pension_calculator(bot, message):  # Defining the function for sending the pension calculator link
    bot.send_message(message.chat.id, 'Use the Pension Calculator: <a href="https://www.abcfinance.am/calculators/pensioncalc.html">Pension Calculator</a>', parse_mode='HTML')  # Sending a message with the pension calculator link
def open_salary_calculator(bot, message):  # Defining the function for sending the salary calculator link
    bot.send_message(message.chat.id, 'Use the Salary Calculator: <a href="https://www.abcfinance.am/calculators/salarycalc.html">Salary Calculator</a>', parse_mode='HTML')  # Sending a message with the salary calculator link
def send_start_message(bot, message):  # Defining the function for sending the welcome message
    bot.send_message(message.chat.id, f'Hello, {message.from_user.first_name}!')  # Sending a welcome message with the user's name
    time.sleep(2)  # Pausing for 2 seconds
    bot.send_message(message.chat.id, 'Choose the <b>“Financial Education”</b> section, then the desired topic, and you will receive the necessary information', parse_mode='HTML')  # Explaining how to choose a topic
    time.sleep(2)  # Pausing for 2 seconds
    bot.send_message(message.chat.id, 'By taking the <b>“Financial Test”</b>, you will check your financial knowledge and get ways to improve it', parse_mode='HTML')  # Explaining the financial test
    time.sleep(2)  # Pausing for 2 seconds
    bot.send_message(message.chat.id, 'To use the calculators, click on the blue <b>“Menu”</b> button on the left', parse_mode='HTML')  # Explaining how to open the calculators
    time.sleep(2)  # Pausing for 2 seconds
    markup = types.ReplyKeyboardMarkup(row_width=2, resize_keyboard=True)  # Creating the reply keyboard
    itembtn1 = types.KeyboardButton('Financial Education')  # Creating the "Financial Education" button
    itembtn2 = types.KeyboardButton('Financial Test')  # Creating the "Financial Test" button
    markup.add(itembtn1, itembtn2)  # Adding the buttons to the keyboard
    bot.send_message(message.chat.id, 'You can type the word <b>“menu”</b> or press the <b>“Main Menu”</b> button to return to the main menu', parse_mode='HTML', reply_markup=markup)  # Sending the instructions together with the keyboard
def send_main_menu_buttons(bot, message):  # Defining the function for sending the main menu
    markup = types.ReplyKeyboardMarkup(row_width=2, resize_keyboard=True)  # Creating the reply keyboard
    itembtn1 = types.KeyboardButton('Financial Education')  # Creating the "Financial Education" button
    itembtn2 = types.KeyboardButton('Financial Test')  # Creating the "Financial Test" button
    markup.add(itembtn1, itembtn2)  # Adding the buttons to the keyboard
    bot.send_message(message.chat.id, "Choose the appropriate section", reply_markup=markup)  # Sending a message with the keyboard
def send_finlearn_buttons(bot, message):  # Defining the function for sending the financial education buttons
    markup = types.ReplyKeyboardMarkup(row_width=4, resize_keyboard=True)  # Creating the reply keyboard
    loan_btn = types.KeyboardButton('Loan')  # Creating the "Loan" button
    main_menu_btn = types.KeyboardButton('Main Menu')  # Creating the "Main Menu" button
    markup.row(loan_btn)  # Adding the "Loan" button as its own row
    markup.row(main_menu_btn)  # Adding the "Main Menu" button as its own row
    bot.send_message(message.chat.id, "Let's learn together", reply_markup=markup)  # Sending a message with the keyboard