Improving keyword recognition and extraction in a Telegram bot [closed]

I am developing a Telegram bot using Python and facing challenges with keyword recognition and extraction. Despite trying various methods like basic string matching, regular expressions, and NLP libraries (spaCy, NLTK), I am unable to achieve consistent keyword recognition and efficient performance with larger datasets.

I have implemented functions for normalizing text, fuzzy matching, and handling messages. However, the recognition remains inconsistent, and the performance drops significantly as the dataset grows. I expect to improve the accuracy of keyword recognition and maintain efficient performance.

Expected: I expected the bot to consistently recognize keywords and respond with appropriate buttons.

Result: The recognition remains inconsistent, and the performance drops significantly as the dataset grows.

Questions:

  • What are the best practices for keyword recognition in Telegram bots?
  • Are there any optimized libraries or tools for this purpose?
  • Could someone provide a working example or point to relevant resources?
import re  # Importing the module for working with regular expressions
import difflib  # Importing the module for sequence comparison
import spacy  # Importing the spaCy library for natural language processing
import nltk  # Importing the NLTK library for natural language processing
from fuzzywuzzy import fuzz  # Importing the module for fuzzy string matching
from nltk.corpus import stopwords  # Importing the list of stop words from NLTK
from nltk.stem import WordNetLemmatizer  # Importing the lemmatizer from NLTK
from telebot import types  # Importing the types module from the telebot library for creating buttons
import config  # Importing the config file containing settings
import time  # Importing the module for working with time
from button_functions import send_buttons  # Importing the function for sending buttons
from buttons import (  # Importing functions for sending various buttons
    send_calculators_buttons,
    send_loan_buttons,
    send_sub_buttons_hipotek,
    send_sub_buttons_grav,
    send_sub_buttons_acra,
)
import logging  # Importing the module for logging
from config import button_phrases  # Importing button phrases from config

# Loading NLTK resources (downloaded at import time; no-ops once cached locally)
nltk.download('punkt')  # Tokenizer models used by NLTK word/sentence tokenization
nltk.download('stopwords')  # Per-language stop-word lists
nltk.download('wordnet')  # WordNet corpus backing WordNetLemmatizer

# Loading spaCy models (small pipelines: tagger + lemmatizer)
nlp_en = spacy.load("en_core_web_sm")  # English pipeline used by normalize_text
nlp_ru = spacy.load("ru_core_news_sm")  # Russian pipeline used by normalize_text
# nlp_hy = spacy.load("hy_core_news_sm")  # Armenian model disabled; normalize_text falls back to a plain split() for 'hy'

# Initializing the lemmatizer and the combined stop-word set
lemmatizer = WordNetLemmatizer()  # NOTE(review): defined but not referenced in this chunk — normalize_text uses spaCy's token.lemma_ instead
stop_words = set(stopwords.words('english')).union(set(stopwords.words('russian'))).union(
    set(config.conjunctions))  # EN + RU stop words plus project conjunctions; NOTE(review): also not referenced in this chunk — confirm it is used elsewhere before removing

def normalize_text(text, lang='en'):
    """Normalize *text* for fuzzy matching.

    Strips punctuation, removes conjunctions (from ``config.conjunctions``),
    then lemmatizes and lower-cases the remaining tokens (spaCy for 'en'/'ru',
    a plain whitespace split for 'hy').

    Args:
        text: Raw input string.
        lang: 'en' (default), 'ru', or 'hy'; selects the processing pipeline.

    Returns:
        The normalized text as a single space-joined, lower-cased string.
    """
    # BUG FIX: the original pattern r'[^ws]' deleted every character except the
    # literal letters 'w' and 's' (the backslashes were lost). r'[^\w\s]' keeps
    # word characters and whitespace while stripping punctuation, as intended.
    text = re.sub(r'[^\w\s]', '', text)
    logging.info(f"Text after regex cleaning: {text}")

    # Drop conjunctions so they do not dilute the fuzzy-match ratios.
    words = text.split()
    words = [word for word in words if word not in config.conjunctions]
    text = ' '.join(words)

    logging.info(f"Text after removing conjunctions: {text}")

    if lang == 'ru':
        doc = nlp_ru(text)  # spaCy Russian pipeline
    elif lang == 'hy':
        doc = text.split()  # No spaCy model for Armenian; fall back to raw words
    else:
        doc = nlp_en(text)  # spaCy English pipeline (default)

    if lang == 'hy':
        # BUG FIX: r'W' matched a literal capital 'W'; r'\W' correctly skips
        # tokens whose first character is a non-word character.
        normalized_tokens = [token.lower() for token in doc if not re.match(r'\W', token)]
    else:
        # Lemma + lower-case, skipping punctuation and whitespace tokens.
        normalized_tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space]

    normalized_text = " ".join(normalized_tokens)
    logging.info(f"Normalized text: {normalized_text}")
    return normalized_text

def fuzzy_match(input_text, phrases):
    """Return the phrase from *phrases* most similar to *input_text*.

    Both sides are run through normalize_text() before comparison; similarity
    is fuzzywuzzy's ``fuzz.ratio`` (0-100).

    Returns:
        Tuple (best_phrase_or_None, highest_ratio).
    """
    query = normalize_text(input_text)
    best_match = None
    highest_ratio = 0

    for candidate in phrases:
        score = fuzz.ratio(query, normalize_text(candidate))
        if score > highest_ratio:
            highest_ratio = score
            best_match = candidate

    logging.info(f"Fuzzy match result: {best_match} with ratio: {highest_ratio}")
    return best_match, highest_ratio

def get_best_match(input_text, phrases):
    """Find the closest phrase to *input_text* using difflib (cutoff 0.8).

    Returns:
        Tuple (original_phrase, ratio_percentage) on success, else (None, 0).
    """
    query = normalize_text(input_text)
    normalized = [normalize_text(p) for p in phrases]

    hits = difflib.get_close_matches(query, normalized, n=1, cutoff=0.8)
    if not hits:
        logging.info("No match found")
        return None, 0

    # Map the normalized hit back to the original (un-normalized) phrase.
    idx = normalized.index(hits[0])
    best_match = phrases[idx]
    ratio = difflib.SequenceMatcher(None, query, hits[0]).ratio() * 100
    logging.info(f"Best match found: {best_match} with ratio: {ratio}")
    return best_match, ratio

def find_best_match(input_text, config_section):
    """Fuzzy-match first; fall back to difflib search when the ratio is below 80."""
    match, ratio = fuzzy_match(input_text, config_section)
    if ratio >= 80:
        return match, ratio
    return get_best_match(input_text, config_section)

def get_matching_keywords(message, threshold=81):
    """Return the single best-matching button/keyword key for *message*.

    Scans ``config.button_phrases`` first, then ``config.keywords``; ties are
    kept by the earlier candidate (strict ``>`` comparison), matching the
    original iteration order.

    Args:
        message: Telegram message object; only ``message.text`` is read.
        threshold: Minimum fuzz ratio (0-100) to accept a match.

    Returns:
        A one-element list ``[key]`` on success, else an empty list.
    """
    text = message.text.lower()
    logging.info(f"get_matching_keywords called with text: {text}")

    # Strip conjunctions so filler words don't dilute the ratios.
    text = ' '.join(word for word in text.split() if word not in config.conjunctions)
    logging.info(f"Text after removing conjunctions: {text}")

    best_match = None
    highest_ratio = 0

    # PERF FIX: the original called fuzzy_match() once PER PHRASE, which
    # re-normalized the input text for every single phrase in the config.
    # One call per key with the full phrase list preserves the result while
    # normalizing the input once per key instead of once per phrase.
    candidates = list(config.button_phrases.items()) + list(config.keywords.items())
    for key, phrases in candidates:
        _, ratio = fuzzy_match(text, [phrase.lower() for phrase in phrases])
        if ratio > highest_ratio:
            best_match = key
            highest_ratio = ratio

    if highest_ratio >= threshold:
        logging.info(
            f"Best matched keyword: {best_match} with ratio: {highest_ratio}")
        return [best_match]

    logging.info(
        f"No exact match found, highest ratio: {highest_ratio}")
    return []

def remove_fuzzy_matches(text, phrases_dict, threshold=74):  # Remove fuzzy-matched phrases from text, collecting their keys
    # Purpose: slide a window of each phrase's word-length across `text`; when
    # fuzz.ratio(window, phrase) >= threshold, record the phrase's key and
    # delete the matched words from the text.
    #
    # Args:
    #   text: space-separated input (callers pass it already lower-cased).
    #   phrases_dict: mapping of key -> list of candidate phrases.
    #   threshold: minimum fuzz.ratio (0-100) to count as a match.
    # Returns:
    #   (removed_keys, cleaned_text) — keys in match order; text with matched
    #   word spans removed.
    words = text.split()  # Work word-by-word so multi-word phrases can be windowed
    removed_phrases = []  # Keys whose phrase matched (may repeat across keys)

    for key, phrases in phrases_dict.items():  # Each configured key in turn
        for phrase in phrases:  # Candidate phrases for this key
            phrase_words = phrase.split()
            phrase_len = len(phrase_words)  # Window size = phrase word count
            indices_to_remove = set()  # Word positions to drop for this phrase

            for i in range(len(words) - phrase_len + 1):  # Slide the window over the (possibly shrunken) word list
                window = words[i:i + phrase_len]
                window_text = ' '.join(window)
                if fuzz.ratio(window_text, phrase) >= threshold:  # Fuzzy hit for this window
                    indices_to_remove.update(range(i, i + phrase_len))
                    removed_phrases.append(key)
                    logging.info(f'Removed phrase: {phrase} (matched key: {key})')
                    break  # Only the first window hit per phrase is removed

            if indices_to_remove:  # A phrase for this key matched
                words = [word for i, word in enumerate(words) if i not in indices_to_remove]  # Rebuild word list without the matched span
                # NOTE(review): this break only exits the `for phrase` loop —
                # i.e. at most ONE phrase is removed per key, but the outer
                # `for key` loop continues to the next key. The original
                # comment claimed it broke the outer loop; it does not.
                break
    # After all keys are scanned, whatever words survived form the cleaned text
    cleaned_text = ' '.join(words)
    logging.info(f"Cleaned text after removal: {cleaned_text}")
    return removed_phrases, cleaned_text  # (matched keys, text minus matched spans)

def handle_message(bot, message):
    """Top-level message handler: extract keyword matches and reply.

    Removes conjunctions, strips fuzzy-matched button phrases and keywords
    from the text (collecting their keys), then either sends the matching
    buttons or a "nothing found" notice. All failures are logged and reported
    to the user rather than raised.
    """
    try:
        # Guard: plain-text messages only (stickers, photos, etc. have no .text).
        if not getattr(message, 'text', None):
            bot.send_message(message.chat.id, "Message does not contain text", parse_mode='Markdown')
            return

        lowered = message.text.lower()
        logging.info(f"Received message: {lowered}")

        # Drop conjunctions before any matching.
        filtered = ' '.join(w for w in lowered.split() if w not in config.conjunctions)
        logging.info(f"Text after removing conjunctions: {filtered}")

        # First pass: button phrases — matched keys plus the leftover text.
        keywords, remainder = remove_fuzzy_matches(filtered, button_phrases)
        logging.info(f"Keywords after button phrase removal: {keywords}")

        # Second pass: plain keywords over whatever text remains.
        extra_keys, _ = remove_fuzzy_matches(remainder, config.keywords)
        keywords.extend(extra_keys)

        logging.info(f'Final keywords: {keywords}')

        if keywords:
            send_buttons(bot, message, keywords)
        else:
            bot.send_message(message.chat.id, "No relevant materials found with the entered content", parse_mode='Markdown')
    except Exception as exc:
        # Boundary handler: log and tell the user instead of crashing the bot.
        logging.error(f"Error handling message: {exc}")
        bot.send_message(message.chat.id, "An error has occurred", parse_mode='Markdown')

def open_site(bot, message):
    """Send the user an HTML link to the ABC Finance website."""
    link = 'Visit the ABC Finance website: <a href="https://www.abcfinance.am/">ABC Finance</a>'
    bot.send_message(message.chat.id, link, parse_mode='HTML')

def open_loan_calculator(bot, message):
    """Reply with a link to the ABC Finance loan calculator."""
    text = 'Use the Loan Calculator: <a href="https://www.abcfinance.am/calculators/loancalc.html">Loan Calculator</a>'
    bot.send_message(message.chat.id, text, parse_mode='HTML')

def open_deposit_calculator(bot, message):
    """Reply with a link to the ABC Finance deposit calculator."""
    text = 'Use the Deposit Calculator: <a href="https://www.abcfinance.am/calculators/depositcalc.html">Deposit Calculator</a>'
    bot.send_message(message.chat.id, text, parse_mode='HTML')

def open_pension_calculator(bot, message):
    """Reply with a link to the ABC Finance pension calculator."""
    text = 'Use the Pension Calculator: <a href="https://www.abcfinance.am/calculators/pensioncalc.html">Pension Calculator</a>'
    bot.send_message(message.chat.id, text, parse_mode='HTML')

def open_salary_calculator(bot, message):
    """Reply with a link to the ABC Finance salary calculator."""
    text = 'Use the Salary Calculator: <a href="https://www.abcfinance.am/calculators/salarycalc.html">Salary Calculator</a>'
    bot.send_message(message.chat.id, text, parse_mode='HTML')

def send_start_message(bot, message):
    """Greet the user and walk them through the bot's features, then show the main-menu keyboard.

    Sends a personalized hello, three instructional messages (2-second pause
    before each), and finally the reply keyboard with the two main sections.
    """
    bot.send_message(message.chat.id, f'Hello, {message.from_user.first_name}!')

    intro_lines = (
        'Choose the <b>“Financial Education”</b> section, then the desired topic, and you will receive the necessary information',
        'By taking the <b>“Financial Test”</b>, you will check your financial knowledge and get ways to improve it',
        'To use the calculators, click on the blue <b>“Menu”</b> button on the left',
    )
    for line in intro_lines:
        time.sleep(2)  # Pace the onboarding so messages arrive one at a time
        bot.send_message(message.chat.id, line, parse_mode='HTML')
    time.sleep(2)

    # Persistent two-button reply keyboard for the main sections.
    markup = types.ReplyKeyboardMarkup(row_width=2, resize_keyboard=True)
    markup.add(types.KeyboardButton('Financial Education'),
               types.KeyboardButton('Financial Test'))

    bot.send_message(message.chat.id, 'You can enter the words <b>“menu”</b> or press the <b>“Main Menu”</b> button to return to the main menu', parse_mode='HTML', reply_markup=markup)

def send_main_menu_buttons(bot, message):
    """Show the two-button main-menu reply keyboard."""
    keyboard = types.ReplyKeyboardMarkup(row_width=2, resize_keyboard=True)
    keyboard.add(types.KeyboardButton('Financial Education'),
                 types.KeyboardButton('Financial Test'))
    bot.send_message(message.chat.id, "Choose the appropriate section", reply_markup=keyboard)

def send_finlearn_buttons(bot, message):
    """Show the financial-education topic keyboard (Loan + Main Menu rows)."""
    keyboard = types.ReplyKeyboardMarkup(row_width=4, resize_keyboard=True)
    keyboard.row(types.KeyboardButton('Loan'))
    keyboard.row(types.KeyboardButton('Main Menu'))
    bot.send_message(message.chat.id, "Let's learn together", reply_markup=keyboard)

1

Trang chủ Giới thiệu Sinh nhật bé trai Sinh nhật bé gái Tổ chức sự kiện Biểu diễn giải trí Dịch vụ khác Trang trí tiệc cưới Tổ chức khai trương Tư vấn dịch vụ Thư viện ảnh Tin tức - sự kiện Liên hệ Chú hề sinh nhật Trang trí YEAR END PARTY công ty Trang trí tất niên cuối năm Trang trí tất niên xu hướng mới nhất Trang trí sinh nhật bé trai Hải Đăng Trang trí sinh nhật bé Khánh Vân Trang trí sinh nhật Bích Ngân Trang trí sinh nhật bé Thanh Trang Thuê ông già Noel phát quà Biểu diễn xiếc khỉ Xiếc quay đĩa Dịch vụ tổ chức sự kiện 5 sao Thông tin về chúng tôi Dịch vụ sinh nhật bé trai Dịch vụ sinh nhật bé gái Sự kiện trọn gói Các tiết mục giải trí Dịch vụ bổ trợ Tiệc cưới sang trọng Dịch vụ khai trương Tư vấn tổ chức sự kiện Hình ảnh sự kiện Cập nhật tin tức Liên hệ ngay Thuê chú hề chuyên nghiệp Tiệc tất niên cho công ty Trang trí tiệc cuối năm Tiệc tất niên độc đáo Sinh nhật bé Hải Đăng Sinh nhật đáng yêu bé Khánh Vân Sinh nhật sang trọng Bích Ngân Tiệc sinh nhật bé Thanh Trang Dịch vụ ông già Noel Xiếc thú vui nhộn Biểu diễn xiếc quay đĩa Dịch vụ tổ chức tiệc uy tín Khám phá dịch vụ của chúng tôi Tiệc sinh nhật cho bé trai Trang trí tiệc cho bé gái Gói sự kiện chuyên nghiệp Chương trình giải trí hấp dẫn Dịch vụ hỗ trợ sự kiện Trang trí tiệc cưới đẹp Khởi đầu thành công với khai trương Chuyên gia tư vấn sự kiện Xem ảnh các sự kiện đẹp Tin mới về sự kiện Kết nối với đội ngũ chuyên gia Chú hề vui nhộn cho tiệc sinh nhật Ý tưởng tiệc cuối năm Tất niên độc đáo Trang trí tiệc hiện đại Tổ chức sinh nhật cho Hải Đăng Sinh nhật độc quyền Khánh Vân Phong cách tiệc Bích Ngân Trang trí tiệc bé Thanh Trang Thuê dịch vụ ông già Noel chuyên nghiệp Xem xiếc khỉ đặc sắc Xiếc quay đĩa thú vị
Trang chủ Giới thiệu Sinh nhật bé trai Sinh nhật bé gái Tổ chức sự kiện Biểu diễn giải trí Dịch vụ khác Trang trí tiệc cưới Tổ chức khai trương Tư vấn dịch vụ Thư viện ảnh Tin tức - sự kiện Liên hệ Chú hề sinh nhật Trang trí YEAR END PARTY công ty Trang trí tất niên cuối năm Trang trí tất niên xu hướng mới nhất Trang trí sinh nhật bé trai Hải Đăng Trang trí sinh nhật bé Khánh Vân Trang trí sinh nhật Bích Ngân Trang trí sinh nhật bé Thanh Trang Thuê ông già Noel phát quà Biểu diễn xiếc khỉ Xiếc quay đĩa
Thiết kế website Thiết kế website Thiết kế website Cách kháng tài khoản quảng cáo Mua bán Fanpage Facebook Dịch vụ SEO Tổ chức sinh nhật