My program is supposed to load data from a CSV file in which the last field of every line is the label of that data point. Since the labels repeat, it is more efficient to store each distinct label once in a list and have the data points created by the program store an index into that list. The list of labels is implemented as follows (handling of temp == NULL will be added later):
/*
 * Table of label strings.  Other data refers to a label by its position
 * in label_table instead of carrying the string itself.
 */
struct _LabelList {
    size_t size;         /* number of entries in label_table */
    char **label_table;  /* the label strings; NULL until the first label is added */
};
typedef struct _LabelList LabelList;
/*
 * Appends `label` to the end of `list`.
 *
 * The list stores the pointer itself, not a copy of the string, so the
 * caller must keep the string alive and unchanged for as long as the list
 * is in use.
 *
 * A list whose label_table is NULL is treated as empty, whatever its size
 * field currently holds.  If the table cannot be grown, nothing is added
 * and size keeps matching the number of labels actually stored.
 */
void add_label(LabelList *list, char *label) {
    if (list == NULL) {
        return;
    }

    /* A NULL table means "no labels yet"; normalise size so the two agree
       even if the allocation below fails. */
    if (list->label_table == NULL) {
        list->size = 0;
    }
    size_t count = list->size;

    /* realloc(NULL, n) behaves like malloc(n), so the first insertion and
       every later one share the same path. */
    char **grown = realloc(list->label_table,
                           (count + 1) * sizeof *list->label_table);
    if (grown == NULL) {
        return;  /* the old table, if any, is still valid and untouched */
    }

    /* Commit only after the allocation succeeded. */
    grown[count] = label;
    list->label_table = grown;
    list->size = count + 1;
}
The seemingly problematic function looks like this:
/*
 * Searches `list` for a label whose text equals `to_check` (compared with
 * strcmp).  Returns the index of the first match, or -1 if there is none.
 */
int check_if_exists(LabelList list, char *to_check) {
    size_t pos = 0;

    while (pos < list.size) {
        if (!strcmp(list.label_table[pos], to_check)) {
            return (int)pos;
        }
        pos++;
    }
    return -1;
}
If the given char* string is already present in the list, the function returns its index in the list; otherwise it returns -1. Tested on its own, it works fine. The problem arises when it is invoked by another function:
/*
 * Builds a LabeledDataPoint from one comma-separated record.
 *
 * data       - writable, NUL-terminated record; strtok() splits it in place,
 *              so the buffer is modified.
 * rec_count  - number of fields in the record: rec_count-1 numeric values
 *              followed by one label.
 * label_list - table of labels seen so far; the record's label is looked up
 *              in it and appended when it is not found.
 *
 * Returns the newly malloc'ed point.
 *
 * NOTE(review): the label handed to add_label() below is a pointer INTO
 * `data`, and add_label() stores that pointer as-is.  Once the caller
 * overwrites or frees `data`, every label stored this way changes or
 * dangles with it.
 */
LabeledDataPoint *create_labeled_data_point(char *data, size_t rec_count, LabelList *label_list) {
char *token;
// NOTE(review): `end` is never used in this function.
char *end;
// NOTE(review): the size is computed by hand because LabeledDataPoint is
// declared elsewhere - confirm it matches the struct. The result of malloc
// is not checked before it is dereferenced.
LabeledDataPoint *point = malloc(sizeof(size_t) + rec_count*sizeof(double) + sizeof(char *));
point->dims = rec_count-1;
// The first strtok() call receives the buffer; later calls pass NULL to
// continue in the same buffer. Each returned token points into `data`.
token = strtok(data, ",");
point->data[0] = atof(token);
for (size_t i = 1; i < rec_count-1; i++) {
token = strtok(NULL, ",");
point->data[i] = atof(token);
};
// Last field: the label. Only ',' is a delimiter here, so any line
// terminator present in `data` stays part of this token.
token = strtok(NULL, ",");
// NOTE(review): check_if_exists() is defined taking a LabelList by value,
// but a LabelList * is passed here.
int index_in_list = check_if_exists(label_list, token);
if (index_in_list == -1) {
// Label not found: append it; it becomes the last entry of the list.
add_label(label_list, token);
point->label_num = label_list->size-1;
}
else {
point->label_num = index_in_list;
}
return point;
}
/*
 * Reads up to `num_records` lines from `file` and converts each one with
 * create_labeled_data_point(), passing `num_fields` as the field count and
 * `label_list` as the shared label table.
 *
 * Returns a heap-allocated array of `num_records` pointers, or NULL if the
 * array itself cannot be allocated.  A slot is NULL when its line could not
 * be read (end of file or read error).  Nothing here frees the array or the
 * points, so that is left to the caller.
 *
 * The same line buffer is reused for every record and freed before
 * returning, so create_labeled_data_point() must not keep pointers into the
 * line it is given.
 */
LabeledDataPoint **load_labeled_data_csv(FILE *file, size_t num_records, size_t num_fields, LabelList *label_list) {
    /* The result is an array of pointers, so it is sized by its element
       type; calloc also rejects a count whose byte size would overflow. */
    LabeledDataPoint **all_data = calloc(num_records, sizeof *all_data);
    if (all_data == NULL) {
        return NULL;
    }

    char *current_record = NULL;  /* line buffer owned by getline() */
    size_t n = 0;                 /* current capacity of that buffer */

    for (size_t i = 0; i < num_records; i++) {
        /* Start from NULL so a failed read never stores an uninitialised
           pointer or repeats the previous record's point. */
        LabeledDataPoint *current_data_point = NULL;

        if (getline(&current_record, &n, file) != -1) {
            current_data_point = create_labeled_data_point(current_record, num_fields, label_list);
        }
        all_data[i] = current_data_point;
    }

    free(current_record);
    return all_data;
}
The first label is added to label_list just as anticipated; however, every subsequent call to check_if_exists from create_labeled_data_point returns 0.
I tried my best to pinpoint the bug. It turns out that when I print the first element of label_list inside check_if_exists, it is always exactly the same as the to_check argument, and that is why strcmp returns 0 every time. I also first tried implementing this with a linked list, but the issue was the same.
EDIT:
Thanks to the comments I’ve solved this issue. Two functions had to be changed:
/*
 * Parses one CSV record into a newly allocated LabeledDataPoint.
 *
 * data       - writable, NUL-terminated line with `rec_count` fields:
 *              rec_count-1 numeric values followed by a label.  strtok()
 *              splits the line in place, so the buffer is modified.
 * rec_count  - total number of fields, label included.
 * label_list - table of labels seen so far.
 *
 * The label is looked up in `label_list`.  A label that is not there yet is
 * copied to the heap and the copy is appended, so the list never points
 * into `data` and the caller may reuse or free the line afterwards.  The
 * copies stay owned by the list and have to be freed together with it.
 *
 * Returns the new point, or NULL when an argument is NULL, rec_count is 0,
 * the line has fewer than rec_count fields, or memory runs out.
 */
LabeledDataPoint *create_labeled_data_point(char *data, size_t rec_count, LabelList *label_list) {
    /* '\r' and '\n' count as delimiters too, so the line terminator that
       getline() leaves in the buffer does not become part of the label. */
    static const char delims[] = ",\r\n";

    if (data == NULL || label_list == NULL || rec_count == 0) {
        return NULL;
    }

    size_t dims = rec_count - 1;

    /* NOTE(review): LabeledDataPoint is declared elsewhere; this assumes
       `data` is its trailing array of doubles - confirm against the struct. */
    LabeledDataPoint *point = malloc(sizeof *point + dims * sizeof point->data[0]);
    if (point == NULL) {
        return NULL;
    }
    point->dims = dims;

    /* strtok() takes the buffer on its first call and NULL on later ones. */
    char *next_input = data;
    for (size_t i = 0; i < dims; i++) {
        char *token = strtok(next_input, delims);
        next_input = NULL;
        if (token == NULL) {
            /* The record ended before all numeric fields were read. */
            free(point);
            return NULL;
        }
        point->data[i] = atof(token);
    }

    char *label = strtok(next_input, delims);
    if (label == NULL) {
        /* No label field. */
        free(point);
        return NULL;
    }

    int index_in_list = check_if_exists(*label_list, label);
    if (index_in_list != -1) {
        /* Known label: nothing needs to be copied. */
        point->label_num = index_in_list;
        return point;
    }

    /* New label: the list gets its own copy, independent of `data`. */
    size_t label_bytes = strlen(label) + 1;
    char *label_copy = malloc(label_bytes);
    if (label_copy == NULL) {
        free(point);
        return NULL;
    }
    memcpy(label_copy, label, label_bytes);

    /* add_label() returns nothing, so a failed append cannot be detected
       here; the new label is taken to be the last entry of the list. */
    add_label(label_list, label_copy);
    point->label_num = label_list->size - 1;
    return point;
}
/*
 * Reads up to `num_records` lines from `file` and converts each one with
 * create_labeled_data_point(), passing `num_fields` as the field count and
 * `label_list` as the shared label table.
 *
 * Returns a heap-allocated array of `num_records` pointers, or NULL if the
 * array itself cannot be allocated.  A slot is NULL when its line could not
 * be read (end of file or read error).  Nothing here frees the array or the
 * points, so that is left to the caller.
 *
 * Every line is read into a fresh buffer that is released again before the
 * next line is read.
 */
LabeledDataPoint **load_labeled_data_csv(FILE *file, size_t num_records, size_t num_fields, LabelList *label_list) {
    /* The result is an array of pointers, so it is sized by its element
       type; calloc also rejects a count whose byte size would overflow. */
    LabeledDataPoint **all_data = calloc(num_records, sizeof *all_data);
    if (all_data == NULL) {
        return NULL;
    }

    for (size_t i = 0; i < num_records; i++) {
        /* NULL buffer with capacity 0 asks getline() to allocate the line. */
        char *current_record = NULL;
        size_t n = 0;

        /* Start from NULL so a failed read never stores an uninitialised
           pointer or repeats the previous record's point. */
        LabeledDataPoint *current_data_point = NULL;

        if (getline(&current_record, &n, file) != -1) {
            current_data_point = create_labeled_data_point(current_record, num_fields, label_list);
        }
        all_data[i] = current_data_point;

        /* getline() may have allocated the buffer even when it failed, so
           it is released on every iteration. */
        free(current_record);
    }
    return all_data;
}
And for easy freeing of the used memory:
/*
 * Frees every label string stored in `list` and then the table itself, and
 * resets the list to its empty state (NULL table, size 0) so that it can be
 * filled again or freed a second time without harm.
 *
 * Each stored label is passed to free(), so every label in the list must be
 * a heap allocation that the list is allowed to release.  The LabelList
 * object itself is not freed.
 */
void free_label_list(LabelList *list) {
    if (list == NULL) {
        return;
    }

    /* A NULL table holds no labels, whatever size says. */
    if (list->label_table != NULL) {
        for (size_t i = 0; i < list->size; i++) {
            free(list->label_table[i]);
        }
        free(list->label_table);
    }

    /* Leave no dangling pointer behind: add_label() treats a NULL table as
       "empty list". */
    list->label_table = NULL;
    list->size = 0;
}
glacierDexeryl is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
4
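You should use ",\n"
as the separator list for your calls to strtok, so that the newline getline leaves at the end of each line is not included in the last token (the label).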