The program works well overall; however, it encounters an issue when it processes accented words. When the program encounters words with accented characters, it stops functioning correctly. The exact reason for this interruption is unclear, and debugging is needed to identify the cause of this issue with accented words.
Through some debugging with various print statements added here and there, I was able to pinpoint that the problem lies within the readCharacters
function when it reads character by character.
The program accepts only C standard libraries.
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <locale.h>
// Object structure containing a String, its frequency, and coordinates in the matrix
typedef struct {
wchar_t string[31];
float frequency;
int occurrence;
int x;
int y;
} Object;
// Matrix structure containing a list of objects and its dimensions
typedef struct {
Object** list;
int* x;
int y;
} Matrix;
// Alphabet structure containing a sub-alphabet and a word
struct Alphabet {
struct Alphabet* subalphabet[46];
Object* word;
};
typedef struct Alphabet Dictionary;
// Function prototypes
Dictionary* deallocate(Dictionary* dict);
void printMatrix(Matrix* matrix, FILE* file);
int asciiIndex(wchar_t character);
int endString(wchar_t string1[], Matrix* matrix, Dictionary* dict, int end);
void readCharacters(FILE* file, Matrix* matrix, Dictionary* dictionary);
// Main function
int main() {
setlocale(LC_ALL, "en_US.UTF-8"); // Set locale to support Unicode
FILE* inputFile = fopen("test.txt", "r");
FILE* outputFile = fopen("output.csv", "w");
if (inputFile == NULL || outputFile == NULL) { // Check if files opened successfully
exit(EXIT_FAILURE); // Exit with error if not
}
Dictionary* dictionary = (Dictionary*)calloc(1, sizeof(Dictionary)); // Allocate memory for dictionary
dictionary->word = (Object*)calloc(1, sizeof(Object)); // Allocate memory for a word in the dictionary
Matrix matrix;
matrix.y = 1;
matrix.x = (int*)calloc(1, sizeof(int));
matrix.x[0] = 1;
matrix.list = (Object**)calloc(1, matrix.y * sizeof(Object*)); // Allocate memory for the matrix
matrix.list[matrix.y - 1] = (Object*)calloc(1, 2 * sizeof(Object)); // Allocate memory for a row of the matrix
readCharacters(inputFile, &matrix, dictionary); // Call function to read characters from file
printMatrix(&matrix, outputFile); // Call function to print matrix to output file
fclose(outputFile); // Close output file
fclose(inputFile); // Close input file
for (int i = 0; i < matrix.y - 1; i++) { // Free allocated memory for matrix
free(matrix.list[i]);
}
free(matrix.list);
free(matrix.x);
for (int i = 0; i < 46; i++) { // Free allocated memory for dictionary
dictionary->subalphabet[i] = deallocate(dictionary->subalphabet[i]); // Deallocate sub-alphabet recursively
}
free(dictionary->word); // Free memory for the word
free(dictionary); // Free memory for the dictionary
return 0; // Return 0
}
// Function to deallocate memory for the dictionary (summary)
Dictionary* deallocate(Dictionary* dict) {
// Implementation omitted for brevity
return NULL;
}
// Function to convert character to an ASCII index
int asciiIndex(wchar_t character) {
wchar_t c = towlower(character); // Convert character to lowercase
if (iswalpha(c)) { // Handle non-accented letters
return c - L'a';
} else { // Handle accented characters
switch (c) {
case L'à': return 26;
case L'è': return 27;
case L'é': return 28;
case L'ì': return 29;
case L'ò': return 30;
case L'ù': return 31;
case L'!': return 32;
case L'?': return 33;
case L'.': return 34;
case L''': return 35;
}
}
if (c >= L'0' && c <= L'9') { // Handle numbers
return 36 + (c - L'0');
}
return -1; // Unrecognized character
}
// Function to handle the end of a string
int endString(wchar_t string1[], Matrix* matrix, Dictionary* dict, int end) {
if (string1[0] == L'') { // If string is empty
return 0; // Return 0
}
wcscpy(matrix->list[matrix->y - 1][1].string, string1); // Copy string to matrix
matrix->list[matrix->y - 1][1].x = 1; // Assign 1 to x coordinate in position 1
matrix->list[matrix->y - 1][0].x = 0; // Assign 0 to x coordinate in position 0
int check = operationalSearch(matrix->list[matrix->y - 1][0].string, 0, dict, matrix->y - 1, 1, matrix); // Call operationalSearch function
if (check == 0) {
matrix->y++;
matrix->list = realloc(matrix->list, matrix->y * sizeof(Object*)); // Reallocate memory for matrix
matrix->x = realloc(matrix->x, matrix->y * sizeof(int));
matrix->x[matrix->y - 1] = 1;
matrix->list[matrix->y - 1] = (Object*)calloc(1, 2 * sizeof(Object));
}
wcscpy(matrix->list[matrix->y - 1][0].string, string1); // Copy string to matrix
if (end == 1) { // If end is 1
wcscpy(matrix->list[matrix->y - 1][1].string, matrix->list[0][0].string); // Copy initial string to matrix
int check = operationalSearch(matrix->list[matrix->y - 1][0].string, 0, dict, matrix->y - 1, 1, matrix); // Call operationalSearch function
return 0; // Return 0
}
string1[0] = L''; // Reset string
return 0; // Return 0
}
// Function to read characters from the file and handle the temporary string
void readCharacters(FILE* file, Matrix* matrix, Dictionary* dictionary) {
wchar_t tempString[31]; // Declare a string of 30 characters
int index = 0; // Initialize the index to 0
wchar_t character; // Declare the character variable
int firstWord = 0;
const wchar_t* accentedChars = L"àèéìòóùÀÈÉÌÒÓÙ";
while ((character = fgetwc(file))) { // Read character from file
wchar_t nextChar = fgetwc(file); // Read next character
fseek(file, -1, SEEK_CUR); // Move file pointer back
if ((character == L' ' || character == L'n') && nextChar == WEOF) { // Handle end of file case
wcscpy(matrix->list[matrix->y - 1][1].string, matrix->list[0][0].string); // Copy initial string to matrix
operationalSearch(matrix->list[matrix->y - 1][0].string, 0, dictionary, matrix->y - 1, 1, matrix); // Check if key has been encountered in the text
return;
}
// Read the first word
if (firstWord == 0 && (character == L'n' || character == L' ')) { // Skip initial n or space characters
continue;
} else if (firstWord == 0 && (iswalnum(character) || (wcschr(accentedChars, character) != NULL))) { // Handle alphanumeric characters or accented characters
matrix->list[0][0].string[index] = character;
index++;
if (nextChar == L' ' || nextChar == L'n' || nextChar == L'!' || nextChar == L'?' || nextChar == L'.') { // Terminate string if followed by space, newline, or punctuation
matrix->list[0][0].string[index] = L'';
index = 0;
firstWord = 1;
} else if (nextChar == L''') { // Append apostrophe and terminate string
matrix->list[0][0].string[index] = L''';
matrix->list[0][0].string[index + 1] = L'';
index = 0;
firstWord = 1;
fseek(file, 1, SEEK_CUR); // Increment cursor to avoid reading apostrophe twice
}
continue;
} else if (firstWord == 0 && (character == L'!' || character == L'?' || character == L'.')) { // Handle special characters
matrix->list[0][0].string[index] = character;
matrix->list[0][0].string[index + 1] = L'';
firstWord = 1;
index = 0;
continue;
}
if (character == L'!' || character == L'?' || character == L'.') {
if (index > 0) {
tempString[index] = L''; // Terminate string
endString(tempString, matrix, dictionary, 0); // Handle end of string
index = 0;
}
tempString[0] = character;
tempString[1] = L'';
if (nextChar == WEOF) {
endString(tempString, matrix, dictionary, 1); // Handle end of file
break;
}
endString(tempString, matrix, dictionary, 0);
index = 0;
} else if (iswalnum(character) || (wcschr(accentedChars, character) != NULL) || character == L''') {
tempString[index++] = character; // Append character to tempString
wprintf(L"%lc", character);
if (nextChar == L' ' || nextChar == L'n' || nextChar == WEOF ||
nextChar == L'!' || nextChar == L'?' || nextChar == L'.' || character == L''') {
tempString[index] = L''; // Terminate string
endString(tempString, matrix, dictionary, (nextChar == WEOF) ? 1 : 0);
index = 0;
}
} else if (character == L' ' || character == L'n') {
if (index > 0) {
tempString[index] = L''; // Terminate string
endString(tempString, matrix, dictionary, 0);
index = 0;
}
} else if (!(iswalnum(character) || (wcschr(accentedChars, character) != NULL) ||
character == L''' || character == L'!' ||
character == L'?' || character == L'.') &&
nextChar == L' ' || nextChar == L'n') {
if (index > 0) {
tempString[index] = L''; // Terminate string
endString(tempString, matrix, dictionary, 0);
index = 0;
}
continue;
}
if (nextChar == WEOF) { // Break if end of file
break;
}
}
return;
}
// Function to print the matrix to the file
void printMatrix(Matrix* matrix, FILE* file) {
for (int i = 0; i <= matrix->y - 1; i++) {
if (matrix->list[i][1].frequency == 0) {
return;
}
fwprintf(file, L"%ls", matrix->list[i][0].string);
for (int j = 1; j <= matrix->x[i]; j++) {
fwprintf(file, L",%ls,%.4f", matrix->list[i][j].string, matrix->list[i][j].frequency);
}
if (i != matrix->y - 1) {
fwprintf(file, L"n");
}
}
}
This program is designed to read an Italian text file that may contain special characters such as .
, ?
, !
, and '
. It scans the words in the text and adds them to a CSV file, maintaining the frequency of each word. The program uses a matrix to store the words and their respective frequencies and coordinates, while a dictionary structure is used to manage the alphabet and sub-alphabets.
-
operationalSearch
: Searches for a string in the matrix and updates occurrences and frequencies. -
compareStrings
: Compares two wide character strings, considering case insensitivity. -
deallocate
: Recursively deallocates memory for the dictionary.