Thiết kế website giá rẻ

Question

I have a folder named cleaned_texts. The folder contains text files(a.txt, b.txt, c.txt etc) and each text file contains tokenized words in this format:[‘Rise’, ‘of’, ‘e-health’, ‘and’, ‘its’, ‘Germany’, ‘dollar’].

Example:

a.txt contains [‘Rise’, ‘of’, ‘e-health’, ‘and’, ‘its’, ‘Thailand’, ‘YEN’, ‘India’] and

b.txt contains [‘PESO’, ‘Man’, ‘development’, ‘never’, ‘Japan’, ‘year’, ‘date’, ‘Canada’].

I also have another folder named StopWords which also contains text files and each text file contains a stop word. The text files are named in this format (currency.txt, names.txt, geographic.txt etc).

Example:

currency.txt contains names of currencies (Eg: BAHT | Thailand, PESO | Mexico, YEN | Japan etc).

geographic.txt contains names of countries (Eg: Canada, China, India, Germany etc).

I want to filter all the stop words contained in the text files inside the StopWords folder, from all the text files in the cleaned_texts folder.

I looped through the stop words folder, Combined all the stop words and converted it to a list. My challenge is how to filter the stop words from my cleaned_texts files. I have been on it for days now but i couldn’t figure out how to do it.

Here is my script:

<code>import glob

import codecs

import os

#Cleaned texts

os.getcwd()

clean_texts_folder = os.path.join(os.getcwd(), 'cleaned_texts')

clean_text_data = []

for root, folders, files in os.walk(clean_texts_folder):

for file in files:

path = os.path.join(root, file)

with codecs.open(path, encoding='utf-8', errors='ignore') as info:

clean_text_data.append(info.read())

#Stop Words

stopwords_folder_path = "StopWords"

stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt'))

for file in stopwords_files:

with open(file, 'r') as w:

stop_words = w.read()

map_dict = {'|': ''}

res = ''.join(

idx if idx not in map_dict else map_dict[idx] for idx in stop_words)

new_list = res.split()

#new_list Output= ['SMITH', 'Surnames', 'from', '1990', 'Thailand', 'YEN', 'India', 'PESO', 'Japan', 'Canada']

#Trying to save the filtered texts

folder_name = "new_texts"

Path(folder).mkdir(parents=True, exist_ok=True)

filtered_sentence = []

for index, word in enumerate(clean_text_data):

if word not in new_list:

#print(filtered_sentence.append(word))

file_path = Path(folder_name, f"{index}.txt")

with pathlib.Path.open(file_path, "w", encoding="utf-8") as f:

f.write(f"{filtered_sentence }")

</code>

<code>import glob import codecs import os #Cleaned texts os.getcwd() clean_texts_folder = os.path.join(os.getcwd(), 'cleaned_texts') clean_text_data = [] for root, folders, files in os.walk(clean_texts_folder): for file in files: path = os.path.join(root, file) with codecs.open(path, encoding='utf-8', errors='ignore') as info: clean_text_data.append(info.read()) #Stop Words stopwords_folder_path = "StopWords" stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt')) for file in stopwords_files: with open(file, 'r') as w: stop_words = w.read() map_dict = {'|': ''} res = ''.join( idx if idx not in map_dict else map_dict[idx] for idx in stop_words) new_list = res.split() #new_list Output= ['SMITH', 'Surnames', 'from', '1990', 'Thailand', 'YEN', 'India', 'PESO', 'Japan', 'Canada'] #Trying to save the filtered texts folder_name = "new_texts" Path(folder).mkdir(parents=True, exist_ok=True) filtered_sentence = [] for index, word in enumerate(clean_text_data): if word not in new_list: #print(filtered_sentence.append(word)) file_path = Path(folder_name, f"{index}.txt") with pathlib.Path.open(file_path, "w", encoding="utf-8") as f: f.write(f"{filtered_sentence }") </code>

import glob
import codecs
import os

#Cleaned texts
os.getcwd()
clean_texts_folder =  os.path.join(os.getcwd(), 'cleaned_texts')

clean_text_data = []
for root, folders, files in os.walk(clean_texts_folder):
    for file in files:
        path = os.path.join(root, file)
        with codecs.open(path, encoding='utf-8', errors='ignore') as info:
            clean_text_data.append(info.read())


#Stop Words
stopwords_folder_path = "StopWords"
stopwords_files = glob.glob(os.path.join(stopwords_folder_path, '*.txt'))

for file in stopwords_files:
    with open(file, 'r') as w:
        stop_words = w.read()
        
        map_dict = {'|': ''}
        res = ''.join(
            idx if idx not in map_dict else map_dict[idx] for idx in stop_words)
        new_list = res.split()

#new_list Output= ['SMITH', 'Surnames', 'from', '1990', 'Thailand', 'YEN', 'India', 'PESO', 'Japan', 'Canada']


#Trying to save the filtered texts
folder_name = "new_texts"
Path(folder).mkdir(parents=True, exist_ok=True)
filtered_sentence = []
for index, word in enumerate(clean_text_data):
    if word not in new_list:
        #print(filtered_sentence.append(word))
        file_path = Path(folder_name, f"{index}.txt")
        with pathlib.Path.open(file_path, "w", encoding="utf-8") as f:
           f.write(f"{filtered_sentence }")

Actual/Resulting Output:
“None” is printing in all the text files.

a.txt = None

b.txt = None

c.txt = None

Expected Output:

a.txt = [‘Rise’, ‘of’, ‘e-health’, ‘and’, ‘its’]

b.txt = [‘Man’, ‘development’, ‘never’,’year’, ‘date’]

Thiết kế website giá rẻ

Danh mục

Filtering stop words out of a multiple text files (using a list of stop words)