I have a function that is used in a script that I am writing to remove redundant blocking keywords from a list. Basically, with the input (in any order) of:
{"apple", "bapple", "banana", "cherry", "bananaman", "sweetherrypie", "sweet", "b"}
It should output a shrunk array of strings (in any order) of:
{"apple", "cherry", "sweet", "b"}
This is done to reduce the number of keywords a matching algorithm needs to pass through as a personal learning project (If a string contained “banana”, it would also contain “b” so banana is unnecessary, and can be discarded
It needed to be extremely performant, as it had to run on lists of over 10,000,000 strings.
Initially, I wrote some Python code to do this:
def deduplicate_keywords(keywords):
# Remove duplicates and sort the keywords by length in ascending order
keywords = sorted(set(keywords), key=len)
unique_keywords = []
for keyword in keywords:
not_redundant = True
for unique_keyword in unique_keywords:
# Check if the current keyword includes any of the unique keywords
if unique_keyword in keyword:
not_redundant = False
break
# If the keyword does not include any of the shorter keywords, add it to unique_keywords
if not_redundant:
unique_keywords.append(keyword)
return unique_keywords
And a performance benchmark to measure its performance:
class TestDeduplicateKeywords(unittest.TestCase):
def test_performance_intensive_case(self):
keywords = [
"abcdef",
"def",
"ghidef",
"jkl",
"mnop",
"qrst",
"uvwxyz",
"ghijkl",
"abcdefgh",
"ijklmnop",
"mnopqrstuv",
"rstuvwx",
"mnopqrstuvwx",
"uvwx",
]
# Extending the list to make it very long and complex
base_keywords = keywords[:]
for i in range(200000):
base_keywords.extend([f"{k}{i}" for k in keywords])
base_keywords.extend([f"{i}{k}" for k in keywords])
base_keywords.extend([f"{i}{k}{i}" for k in keywords])
keywords = base_keywords
old_function_time = 0
new_function_time = 0
for repeat in range(1):
# This loop (not present in the cpp code) is used to loop it multiple times to find an average
start_time = time.time()
result = deduplicate_keywords(keywords)
end_time = time.time()
old_function_time = (
repeat * old_function_time + (end_time - start_time)
) / (repeat + 1)
start_time = time.time()
result = deduplicate_keywords_new(keywords)
# Above is a function that can be edited to check whether changes speed up the code
end_time = time.time()
new_function_time = (
repeat * new_function_time + (end_time - start_time)
) / (repeat + 1)
# As the list is extended with numbers, the result will be unique non-redundant keywords
expected_result = ["def", "jkl", "mnop", "qrst", "uvwx"]
self.assertEqual(set(result), set(expected_result))
print(
f"Test performance_intensive_case for old function took {old_function_time} secondsn"
f"Test performance_intensive_case for new function took {new_function_time} secondsn"
)
if old_function_time < new_function_time:
print(f"Old is faster than new")
else:
print(f"New is faster than old")
if __name__ == "__main__":
unittest.main()
This was too slow for my needs at 12 seconds
on my weak laptop.
Therefore, I tried writing the same thing in cpp to see whether I would net a performance gain. It was so much slower, so I added some timing code to the function itself to see where the slowdown was:
#include <iostream>
#include <vector>
#include <string>
#include <set>
#include <algorithm>
#include <chrono>
#include <cassert>
std::vector<std::string> deduplicate_keywords(const std::vector<std::string>& keywords) {
auto t_start = std::chrono::high_resolution_clock::now();
// Remove duplicates using a set
auto t1 = std::chrono::high_resolution_clock::now();
std::cout << "started old function!" << std::endl;
std::set<std::string> unique_set(keywords.begin(), keywords.end());
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << "removed duplicates in " << std::chrono::duration<double>(t2 - t1).count() << " seconds" << std::endl;
// Convert set back to vector and sort by length
auto t3 = std::chrono::high_resolution_clock::now();
std::vector<std::string> sorted_keywords(unique_set.begin(), unique_set.end());
std::sort(sorted_keywords.begin(), sorted_keywords.end(), [](const std::string& a, const std::string& b) {
return a.length() < b.length();
});
auto t4 = std::chrono::high_resolution_clock::now();
std::cout << "Sorted by length in " << std::chrono::duration<double>(t4 - t3).count() << " seconds" << std::endl;
// Filter out redundant keywords
auto t5 = std::chrono::high_resolution_clock::now();
std::vector<std::string> unique_keywords;
for (const auto& keyword : sorted_keywords) {
for (const auto& unique_keyword : unique_keywords) {
// Check if the current keyword includes any of the unique keywords
if (keyword.find(unique_keyword) != std::string::npos) {
goto next_iteration;
}
}
// If the keyword does not include any of the shorter keywords, add it to unique_keywords
unique_keywords.push_back(keyword);
next_iteration:
continue;
}
auto t6 = std::chrono::high_resolution_clock::now();
std::cout << "Filtered in " << std::chrono::duration<double>(t6 - t5).count() << " seconds" << std::endl;
auto t7 = std::chrono::high_resolution_clock::now();
sorted_keywords.clear();
sorted_keywords.shrink_to_fit();
unique_keywords.shrink_to_fit();
auto t8 = std::chrono::high_resolution_clock::now();
std::cout << "Destructed in " << std::chrono::duration<double>(t8 - t7).count() << " seconds" << std::endl;
auto t9 = std::chrono::high_resolution_clock::now();
auto total_time = std::chrono::duration<double>(t9 - t_start).count();
std::cout << "Total function time: " << total_time << " seconds" << std::endl;
return unique_keywords;
}
void test_performance_intensive_case() {
std::vector<std::string> keywords = {
"abcdef", "def", "ghidef", "jkl", "mnop", "qrst", "uvwxyz",
"ghijkl", "abcdefgh", "ijklmnop", "mnopqrstuv", "rstuvwx",
"mnopqrstuvwx", "uvwx"
};
// Extending the list to make it very long and complex
std::vector<std::string> base_keywords = keywords;
for (int i = 0; i < 20000; ++i) {
for (const auto& k : keywords) {
base_keywords.push_back(k + std::to_string(i));
base_keywords.push_back(std::to_string(i) + k);
base_keywords.push_back(std::to_string(i) + k + std::to_string(i));
}
}
auto keywords_extended = base_keywords;
std::set<std::string> expected_result = {"def", "jkl", "mnop", "qrst", "uvwx"};
// there is no looping to calculate an average because it is so much slower
auto start_time_total = std::chrono::high_resolution_clock::now();
auto start_time = std::chrono::high_resolution_clock::now();
std::vector<std::string> result_old = deduplicate_keywords(keywords_extended);
auto end_time = std::chrono::high_resolution_clock::now();
double old_function_time = std::chrono::duration<double>(end_time - start_time).count();
auto end_time_total = std::chrono::high_resolution_clock::now();
double total_test_time = std::chrono::duration<double>(end_time_total - start_time_total).count();
assert(std::set<std::string>(result_old.begin(), result_old.end()) == expected_result);
std::cout << "old function time: " << old_function_time << " seconds" << std::endl;
std::cout << "Total test case time: " << total_test_time << " seconds" << std::endl;
auto start_time_total_new = std::chrono::high_resolution_clock::now();
auto start_time_new = std::chrono::high_resolution_clock::now();
std::vector<std::string>result_new = deduplicate_keywords_new(keywords_extended); // same as in the python code, useful for comparison
auto end_time_new = std::chrono::high_resolution_clock::now();
double new_function_time = std::chrono::duration<double>(end_time - start_time).count();
auto end_time_total_new = std::chrono::high_resolution_clock::now();
double total_test_time_new = std::chrono::duration<double>(end_time_total_new - start_time_total_new).count();
assert(std::set<std::string>(result_new.begin(), result_new.end()) == expected_result);
std::cout << "new function time: " << new_function_time << " seconds" << std::endl;
std::cout << "Total test case time: " << total_test_time_new << " seconds" << std::endl;
std::cout << "Test performance_intensive_case for old function took " << old_function_time
<< " seconds but new function took " << new_function_time << " seconds" << std::endl;
if (old_function_time < new_function_time) {
std::cout << "Old is faster than new" << std::endl;
} else {
std::cout << "New is faster than old" << std::endl;
}
}
int main() {
test_performance_intensive_case();
return 0;
}
This outputted to the terminal this (shortened, removing the logs for the currently empty deduplicate_keywords_new function):
started old function!
removed duplicates in 9.64173 seconds
Sorted by length in 3.36138 seconds
Filtered in 0.366933 seconds
Destructed in 7.63208 seconds
Total function time: 21.0037 seconds
old function time: 405.213 seconds
Total test case time: 405.213 seconds
Aside from the gereral speed reduction, the logs seem to indicate that returning the function (return unique keywords
) seems to take extremely long. Why is this? And what can I do to stop this? I am fairly new to cpp coding and development, so it would be great if your explainations were detailed and thorough. Thanks!
user26571886 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
1