I am trying to shuffle the rows and columns of a matrix in C++11 while retaining the row and column names. I am able to generate the shuffled matrix, but the names are not retained in the output.
#include <algorithm>
#include <exception>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream> // For std::istringstream
#include <string>
#include <vector>

#include "util/base.hpp"
// Writes `matrix` to the file `out_file_name` as tab-separated text:
// one line per row, values separated by a single tab, each line terminated
// by a newline. An empty row produces an empty line.
//
// If the file cannot be opened, an error is reported on std::cerr and
// nothing is written.
void writeMatrix(const std::vector<std::vector<double>>& matrix, const std::string& out_file_name) {
    std::ofstream file(out_file_name);
    if (!file) {
        std::cerr << "Error: could not open output file '" << out_file_name << "'\n";
        return;
    }
    for (const auto& row : matrix) {
        for (size_t i = 0; i < row.size(); ++i) {
            // Separator goes between values only, never after the last one.
            if (i != 0) file << '\t';
            file << row[i];
        }
        file << '\n';
    }
    // `file` is flushed and closed by its destructor.
}
// Randomly reorders the values inside every row of `matrix`, in place.
// Rows keep their position and their length; only the order of the values
// within each row changes. Rows are visited first to last and all draw from
// the same generator `g`, so the result is reproducible for a given
// generator state.
void permuteMatrix(std::vector<std::vector<double>>& matrix, std::mt19937& g) {
    std::for_each(matrix.begin(), matrix.end(), [&g](std::vector<double>& values) {
        std::shuffle(values.begin(), values.end(), g);
    });
}
// Entry point.
//
// Usage: <program> <input file> <output file base> <number of permutations> <seed>
//
// Loads a matrix from the input file, then writes <number of permutations>
// files named "<output file base>_permutation<k>.txt" (k starting at 1), each
// holding a copy of the matrix whose rows were shuffled by permuteMatrix.
// A single generator seeded with <seed> is shared by all permutations.
//
// Returns 0 on success, 1 when the arguments are missing or not integers.
int main(int argc, char* argv[]) {
    if (argc < 5) {
        std::cerr << "Usage: " << argv[0]
                  << " <input file> <output file base> <number of permutations> <seed>\n";
        return 1;
    }

    const std::string inputFile = argv[1];
    const std::string outputFileBase = argv[2];

    // std::stoi throws on non-numeric or out-of-range text; report that as a
    // usage error instead of letting the exception terminate the program.
    int numPermutations = 0;
    int seed = 0;
    try {
        numPermutations = std::stoi(argv[3]);
        seed = std::stoi(argv[4]);
    } catch (const std::exception& e) {
        std::cerr << "Error: <number of permutations> and <seed> must be integers ("
                  << e.what() << ")\n";
        return 1;
    }

    // Read the input file. `load` comes from util/base.hpp (not visible here);
    // presumably it fills the label vectors and the numeric matrix — confirm.
    std::vector<std::string> annotation_vec;
    std::vector<std::string> gene_name_vec;
    std::vector<std::vector<double>> matrix;
    load(inputFile, annotation_vec, gene_name_vec, matrix);

    // NOTE(review): annotation_vec and gene_name_vec are never written out, so
    // the output files contain numeric values only. This is why row/column
    // names are missing from the results. Emitting them requires knowing how
    // load() lays out these vectors (e.g. whether the header includes a corner
    // cell) — confirm against util/base.hpp before adding labelled output.

    // One generator for the whole run so the sequence of permutations is
    // reproducible from the seed.
    std::mt19937 g(seed);

    for (int i = 0; i < numPermutations; ++i) {
        auto permutedMatrix = matrix;      // work on a copy; keep the original intact
        permuteMatrix(permutedMatrix, g);  // shuffle values within each row

        const std::string out_file_name =
            outputFileBase + "_permutation" + std::to_string(i + 1) + ".txt";
        writeMatrix(permutedMatrix, out_file_name);
    }
    return 0;
}
The base.hpp header can be found on GitHub. Essentially, the aim is to generate random permutations of the original matrix, compute Pearson/Spearman correlations on each permuted matrix, and investigate whether the FDR of the randomly permuted correlation pairs is higher than that obtained from a biological sample, as a way of testing the null hypothesis.