I am developing a Telegram bot in Python and facing challenges with keyword recognition and extraction. Despite trying several approaches, including basic string matching, regular expressions, and NLP libraries (spaCy, NLTK), I cannot achieve consistent keyword recognition or acceptable performance on larger datasets.
I have implemented functions for normalizing text, fuzzy matching, and handling messages, but recognition is still inconsistent and performance drops significantly as the dataset grows. My goal is to improve recognition accuracy while keeping the bot responsive.
Expected: I expected the bot to consistently recognize keywords and respond with appropriate buttons.
Result: The recognition remains inconsistent, and the performance drops significantly as the dataset grows.
Questions:
- What are the best practices for keyword recognition in Telegram bots?
- Are there any optimized libraries or tools for this purpose?
- Could someone provide a working example or point to relevant resources?
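For reference, the phrase dictionaries in config.py look roughly like this (a simplified, illustrative excerpt; the real lists are much longer and also contain Russian and Armenian entries):

button_phrases = {
    'loan': ['loan', 'loan terms', 'take out a loan'],
    'calculators': ['loan calculator', 'deposit calculator'],
}
keywords = {
    'hipotek': ['mortgage', 'hipotek'],
    'grav': ['pledge', 'collateral'],
}
conjunctions = ['and', 'or', 'but', 'и', 'или']

The relevant bot code is below.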
import re # Importing the module for working with regular expressions
import difflib # Importing the module for sequence comparison
import spacy # Importing the spaCy library for natural language processing
import nltk # Importing the NLTK library for natural language processing
from fuzzywuzzy import fuzz # Importing the module for fuzzy string matching
from nltk.corpus import stopwords # Importing the list of stop words from NLTK
from nltk.stem import WordNetLemmatizer # Importing the lemmatizer from NLTK
from telebot import types # Importing the types module from the telebot library for creating buttons
import config # Importing the config file containing settings
import time # Importing the module for working with time
from button_functions import send_buttons # Importing the function for sending buttons
from buttons import (  # Importing functions for sending various buttons
    send_calculators_buttons,
    send_loan_buttons,
    send_sub_buttons_hipotek,
    send_sub_buttons_grav,
    send_sub_buttons_acra,
)
import logging # Importing the module for logging
from config import button_phrases # Importing button phrases from config
# Loading NLTK resources
nltk.download('punkt') # Downloading the tokenizer from NLTK
nltk.download('stopwords') # Downloading the stop words from NLTK
nltk.download('wordnet') # Downloading the WordNet lemmatizer from NLTK
# Loading spaCy models
nlp_en = spacy.load("en_core_web_sm") # Loading the model for English
nlp_ru = spacy.load("ru_core_news_sm") # Loading the model for Russian
# nlp_hy = spacy.load("hy_core_news_sm") # Loading the model for Armenian
# Initializing the lemmatizer and the list of stop words
lemmatizer = WordNetLemmatizer() # Initializing the lemmatizer
stop_words = set(stopwords.words('english')).union(set(stopwords.words('russian'))).union(
    set(config.conjunctions))  # Combining English and Russian stop words with the conjunctions from the config file
def normalize_text(text, lang='en'):  # Defining the function for normalizing text
    text = re.sub(r'[^\w\s]', '', text)  # Removing punctuation: every character that is not a word character or whitespace
    logging.info(f"Text after regex cleaning: {text}")  # Logging the text after removing characters
    # Removing conjunctions from the text
    words = text.split()  # Splitting the text into individual words
    words = [word for word in words if word not in config.conjunctions]  # Removing conjunctions using the list from the config file
    text = ' '.join(words)  # Joining the remaining words back into a text string
    logging.info(f"Text after removing conjunctions: {text}")  # Logging the text after removing conjunctions
    if lang == 'ru':  # If the language of the text is Russian
        doc = nlp_ru(text)  # Processing the text with the Russian language model
    elif lang == 'hy':  # If the language of the text is Armenian
        doc = text.split()  # Simple whitespace tokenization for Armenian (no spaCy model loaded)
    else:  # Default to English
        doc = nlp_en(text)  # Processing the text with the English language model
    if lang == 'hy':  # Armenian: tokens are plain strings
        normalized_tokens = [token.lower() for token in doc if not re.match(r'\W', token)]  # Lowercasing and dropping tokens that start with a non-word character
    else:  # English or Russian: tokens are spaCy tokens
        normalized_tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space]  # Lemmatizing and lowercasing, skipping punctuation and whitespace tokens
    normalized_text = " ".join(normalized_tokens)  # Joining the normalized tokens into a string
    logging.info(f"Normalized text: {normalized_text}")  # Logging the normalized text
    return normalized_text  # Returning the normalized text
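# Illustrative example of what the function returns (the exact output depends on the
# spaCy model version and on what config.conjunctions contains):
# normalize_text("What are the loan terms?") -> roughly "what be the loan term"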
def fuzzy_match(input_text, phrases):  # Defining the function for fuzzy matching
    best_match = None  # Initializing the variable for the best match
    highest_ratio = 0  # Initializing the variable for the highest ratio
    normalized_input = normalize_text(input_text)  # Normalizing the input text
    for phrase in phrases:  # For each phrase in the list of phrases
        normalized_phrase = normalize_text(phrase)  # Normalizing the phrase
        ratio = fuzz.ratio(normalized_input, normalized_phrase)  # Comparing the text and the phrase
        if ratio > highest_ratio:  # If the current ratio is higher than the highest ratio
            highest_ratio = ratio  # Updating the highest ratio
            best_match = phrase  # Updating the best phrase
    logging.info(f"Fuzzy match result: {best_match} with ratio: {highest_ratio}")  # Logging the result of the fuzzy match
    return best_match, highest_ratio  # Returning the best phrase and the highest ratio
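# A variant I am considering to cut down repeated work (a sketch only, assuming the phrase
# lists never change at runtime): memoize normalize_text so that the same phrases are not
# re-normalized on every incoming message.
from functools import lru_cache

@lru_cache(maxsize=None)
def normalize_text_cached(text, lang='en'):  # Memoized wrapper around normalize_text
    return normalize_text(text, lang)
# fuzzy_match and get_best_match could then call normalize_text_cached(phrase) instead of normalize_text(phrase).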
def get_best_match(input_text, phrases):  # Defining the function for finding the best match
    input_text = normalize_text(input_text)  # Normalizing the input text
    normalized_phrases = [normalize_text(phrase) for phrase in phrases]  # Normalizing the phrases
    matches = difflib.get_close_matches(input_text, normalized_phrases, n=1, cutoff=0.8)  # Finding the closest match
    if matches:  # If a match is found
        best_match_index = normalized_phrases.index(matches[0])  # Getting the index of the best match
        best_match = phrases[best_match_index]  # Getting the original (unnormalized) phrase
        ratio = difflib.SequenceMatcher(None, input_text, matches[0]).ratio() * 100  # Calculating the match percentage
        logging.info(f"Best match found: {best_match} with ratio: {ratio}")  # Logging the result of the match
        return best_match, ratio  # Returning the best phrase and the match percentage
    logging.info("No match found")  # Logging the absence of matches
    return None, 0  # Returning None and 0 if no matches are found
def find_best_match(input_text, config_section):  # Defining the function for finding the best match against a config section
    best_match, highest_ratio = fuzzy_match(input_text, config_section)  # Performing fuzzy matching
    if highest_ratio < 80:  # If the best fuzzy ratio is below 80
        best_match, highest_ratio = get_best_match(input_text, config_section)  # Falling back to the difflib closest-match search
    return best_match, highest_ratio  # Returning the best phrase and the highest ratio
def get_matching_keywords(message, threshold=81):  # Defining the function for getting matching keywords
    text = message.text.lower()  # Converting the message text to lowercase
    logging.info(f"get_matching_keywords called with text: {text}")  # Logging the function call with text
    best_match = None  # Initializing the variable for the best match
    highest_ratio = 0  # Initializing the variable for the highest ratio
    # Removing conjunctions
    text = ' '.join([word for word in text.split() if word not in config.conjunctions])  # Removing conjunctions from the text
    logging.info(f"Text after removing conjunctions: {text}")  # Logging the text after removing conjunctions
    # Checking for matches with button phrases
    for button, phrases in config.button_phrases.items():  # For each button and its phrases in the dictionary of button phrases
        for phrase in phrases:  # For each phrase in the list of phrases
            best_match_phrase, highest_ratio_phrase = fuzzy_match(text, [phrase.lower()])  # Performing fuzzy matching with the button phrase
            if highest_ratio_phrase > highest_ratio:  # If the current ratio is higher than the highest ratio
                best_match = button  # Updating the best button
                highest_ratio = highest_ratio_phrase  # Updating the highest ratio
    # Checking for matches with keywords
    for key, words in config.keywords.items():  # For each key and its words in the dictionary of keywords
        for word in words:  # For each word in the list of words
            best_match_word, highest_ratio_word = fuzzy_match(text, [word.lower()])  # Performing fuzzy matching with the keyword
            if highest_ratio_word > highest_ratio:  # If the current ratio is higher than the highest ratio
                best_match = key  # Updating the best key
                highest_ratio = highest_ratio_word  # Updating the highest ratio
    if highest_ratio >= threshold:  # If the highest ratio is greater than or equal to the threshold
        logging.info(f"Best matched keyword: {best_match} with ratio: {highest_ratio}")  # Logging the best matching keyword and the ratio
        return [best_match]  # Returning the list with the best keyword
    else:  # If the highest ratio is below the threshold
        logging.info(f"No exact match found, highest ratio: {highest_ratio}")  # Logging the absence of an exact match and the highest ratio
        return []  # Returning an empty list
def remove_fuzzy_matches(text, phrases_dict, threshold=74):  # Defining the function for removing fuzzy-matched phrases
    words = text.split()  # Splitting the text into words
    removed_phrases = []  # Initializing the list of removed phrase keys
    for key, phrases in phrases_dict.items():  # For each key and its phrases in the dictionary
        for phrase in phrases:  # For each phrase in the list of phrases
            phrase_words = phrase.split()  # Splitting the phrase into words
            phrase_len = len(phrase_words)  # Determining the length of the phrase
            indices_to_remove = set()  # Initializing the set of word indices to remove
            for i in range(len(words) - phrase_len + 1):  # Sliding a window of the phrase length over the words in the text
                window = words[i:i + phrase_len]  # Defining the window of words
                window_text = ' '.join(window)  # Joining the window words into a string
                if fuzz.ratio(window_text, phrase) >= threshold:  # If the match ratio is greater than or equal to the threshold
                    indices_to_remove.update(range(i, i + phrase_len))  # Marking the window indices for removal
                    removed_phrases.append(key)  # Adding the key to the list of removed phrases
                    logging.info(f'Removed phrase: {phrase} (matched key: {key})')  # Logging the removed phrase and key
                    break  # Stopping the window scan after the first match
            if indices_to_remove:  # If there are indices to remove
                words = [word for i, word in enumerate(words) if i not in indices_to_remove]  # Removing the matched words from the text
                break  # Moving on to the next key after a match
    cleaned_text = ' '.join(words)  # Joining the remaining words into a string
    logging.info(f"Cleaned text after removal: {cleaned_text}")  # Logging the cleaned text
    return removed_phrases, cleaned_text  # Returning the list of removed phrase keys and the cleaned text
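# Illustrative call with a hypothetical phrase dictionary:
# remove_fuzzy_matches("show me the loan calculator please", {"calculators": ["loan calculator"]})
# returns (["calculators"], "show me the please"): the window "loan calculator" scores 100
# against the phrase, so the key is recorded and the matched words are dropped from the text.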
def handle_message(bot, message):  # Defining the function for handling messages
    try:  # Starting the try block for exception handling
        if not hasattr(message, 'text') or not message.text:  # Checking whether the message has text
            bot.send_message(message.chat.id, "Message does not contain text", parse_mode='Markdown')  # Notifying the user that the message has no text
            return  # Stopping the function execution
        text = message.text.lower()  # Converting the message text to lowercase
        logging.info(f"Received message: {text}")  # Logging the received message
        # Removing conjunctions before processing the text
        text = ' '.join([word for word in text.split() if word not in config.conjunctions])  # Removing conjunctions from the text
        logging.info(f"Text after removing conjunctions: {text}")  # Logging the text after removing conjunctions
        btn_phr_removed = remove_fuzzy_matches(text, button_phrases)  # Removing matched button phrases from the text
        keywords = btn_phr_removed[0]  # Getting the list of matched button keys
        logging.info(f"Keywords after button phrase removal: {keywords}")  # Logging the keywords after removing button phrases
        new_text = btn_phr_removed[1]  # Getting the cleaned text
        keyword_matches = remove_fuzzy_matches(new_text, config.keywords)  # Removing matched keywords from the cleaned text
        keywords.extend(keyword_matches[0])  # Adding the matched keyword keys to the list
        logging.info(f'Final keywords: {keywords}')  # Logging the final keywords
        if keywords:  # If keywords are found
            send_buttons(bot, message, keywords)  # Sending the corresponding buttons
        else:  # If no keywords are found
            bot.send_message(message.chat.id, "No relevant materials were found for your request", parse_mode='Markdown')  # Informing the user that nothing was found
    except Exception as e:  # Catching any exception that may occur
        logging.error(f"Error handling message: {e}")  # Logging the error
        bot.send_message(message.chat.id, "An error has occurred", parse_mode='Markdown')  # Informing the user about the error
def open_site(bot, message):  # Defining the function for sending the site link
    bot.send_message(message.chat.id, 'Visit the ABC Finance website: <a href="https://www.abcfinance.am/">ABC Finance</a>', parse_mode='HTML')  # Sending a message with the site link
def open_loan_calculator(bot, message):  # Defining the function for sending the loan calculator link
    bot.send_message(message.chat.id, 'Use the Loan Calculator: <a href="https://www.abcfinance.am/calculators/loancalc.html">Loan Calculator</a>', parse_mode='HTML')  # Sending a message with the loan calculator link
def open_deposit_calculator(bot, message):  # Defining the function for sending the deposit calculator link
    bot.send_message(message.chat.id, 'Use the Deposit Calculator: <a href="https://www.abcfinance.am/calculators/depositcalc.html">Deposit Calculator</a>', parse_mode='HTML')  # Sending a message with the deposit calculator link
def open_pension_calculator(bot, message):  # Defining the function for sending the pension calculator link
    bot.send_message(message.chat.id, 'Use the Pension Calculator: <a href="https://www.abcfinance.am/calculators/pensioncalc.html">Pension Calculator</a>', parse_mode='HTML')  # Sending a message with the pension calculator link
def open_salary_calculator(bot, message):  # Defining the function for sending the salary calculator link
    bot.send_message(message.chat.id, 'Use the Salary Calculator: <a href="https://www.abcfinance.am/calculators/salarycalc.html">Salary Calculator</a>', parse_mode='HTML')  # Sending a message with the salary calculator link
def send_start_message(bot, message):  # Defining the function for sending the welcome message
    bot.send_message(message.chat.id, f'Hello, {message.from_user.first_name}!')  # Sending a welcome message with the user's name
    time.sleep(2)  # Pausing for 2 seconds
    bot.send_message(message.chat.id, 'Choose the <b>“Financial Education”</b> section, then the desired topic, and you will receive the necessary information', parse_mode='HTML')  # Explaining how to choose a topic
    time.sleep(2)  # Pausing for 2 seconds
    bot.send_message(message.chat.id, 'By taking the <b>“Financial Test”</b>, you will check your financial knowledge and get ways to improve it', parse_mode='HTML')  # Explaining the financial test
    time.sleep(2)  # Pausing for 2 seconds
    bot.send_message(message.chat.id, 'To use the calculators, click on the blue <b>“Menu”</b> button on the left', parse_mode='HTML')  # Explaining how to open the calculators
    time.sleep(2)  # Pausing for 2 seconds
    markup = types.ReplyKeyboardMarkup(row_width=2, resize_keyboard=True)  # Creating the reply keyboard
    itembtn1 = types.KeyboardButton('Financial Education')  # Creating the "Financial Education" button
    itembtn2 = types.KeyboardButton('Financial Test')  # Creating the "Financial Test" button
    markup.add(itembtn1, itembtn2)  # Adding the buttons to the keyboard
    bot.send_message(message.chat.id, 'You can type the word <b>“menu”</b> or press the <b>“Main Menu”</b> button to return to the main menu', parse_mode='HTML', reply_markup=markup)  # Sending the instructions together with the keyboard
def send_main_menu_buttons(bot, message):  # Defining the function for sending the main menu
    markup = types.ReplyKeyboardMarkup(row_width=2, resize_keyboard=True)  # Creating the reply keyboard
    itembtn1 = types.KeyboardButton('Financial Education')  # Creating the "Financial Education" button
    itembtn2 = types.KeyboardButton('Financial Test')  # Creating the "Financial Test" button
    markup.add(itembtn1, itembtn2)  # Adding the buttons to the keyboard
    bot.send_message(message.chat.id, "Choose the appropriate section", reply_markup=markup)  # Sending a message with the keyboard
def send_finlearn_buttons(bot, message):  # Defining the function for sending the financial education buttons
    markup = types.ReplyKeyboardMarkup(row_width=4, resize_keyboard=True)  # Creating the reply keyboard
    loan_btn = types.KeyboardButton('Loan')  # Creating the "Loan" button
    main_menu_btn = types.KeyboardButton('Main Menu')  # Creating the "Main Menu" button
    markup.row(loan_btn)  # Adding the "Loan" button as its own row
    markup.row(main_menu_btn)  # Adding the "Main Menu" button as its own row
    bot.send_message(message.chat.id, "Let's learn together", reply_markup=markup)  # Sending a message with the keyboard