LCOV - code coverage report
Current view: top level - dtwc - fileOperations.hpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 80 90 88.9 %
Date: 2024-09-07 20:53:22 Functions: 6 6 100.0 %

          Line data    Source code
       1             : /**
       2             :  * @file fileOperations.hpp
       3             :  * @brief Functions for file operations
       4             :  *
       5             :  * @details This header file declares various functions for performing file operations such as
       6             :  * reading and writing data to/from files. It includes functions to handle comma-separated values (CSV) files,
       7             :  * read data into vectors or Armadillo matrices, and save matrices to files.
       8             :  * It provides the functionality to ignore Byte Order Marks (BOM) in text files,
       9             :  * read specific rows and columns from files, and handle data from directories or batch files.
      10             :  *
      11             :  * @date 21 Jan 2022
      12             :  * @author Volkan Kumtepeli
      13             :  * @author Becky Perriment
      14             :  */
      15             : 
      16             : #pragma once
      17             : 
      18             : #include "settings.hpp" // for resultsPath
      19             : 
      20             : #include <cassert>    // for assert
      21             : #include <chrono>     // for filesystem
      22             : #include <cstdlib>    // for size_t
      23             : #include <filesystem> // for operator<<, path, operator/, directory_iterator
      24             : #include <iostream>   // for operator<<, ifstream, basic_ostream, operator>>
      25             : #include <string>     // for string, getline, to_string
      26             : #include <utility>    // for pair
      27             : #include <vector>     // for vector
      28             : #include <fstream>
      29             : #include <string>
      30             : #include <sstream>
      31             : #include <stdexcept> // for std::runtime_error
      32             : 
      33             : #include <armadillo>
      34             : 
      35             : namespace dtwc {
      36             : 
      37             : namespace fs = std::filesystem;
      38             : 
      39             : /**
      40             :  * @brief Ignores Byte Order Mark (BOM) in UTF-8 encoded files.
      41             :  *
      42             :  * @param in Reference to the input stream to process.
      43             :  */
      44         466 : inline void ignoreBOM(std::istream &in)
      45             : {
      46         466 :   char BOMchars[] = { '\xEF', '\xBB', '\xBF' };
      47         466 :   int seek = 0;
      48         466 :   char c = '.';
      49         469 :   while (in >> c) {
      50         362 :     if (BOMchars[seek] != c) {
      51         359 :       in.putback(c);
      52         359 :       break;
      53             :     }
      54           3 :     seek++;
      55             :   }
      56         466 :   in.clear(); // Clear EOF flag if end of file was reached
      57         466 : }
      58             : 
      59             : /**
      60             :  * @brief Reads a file and returns the data as a vector of a specified type.
      61             :  *
      62             :  * @tparam data_t The data type of the elements to be read.
      63             :  * @param name Path of the file to read.
      64             :  * @param start_row Starting row index for reading the data (default is 0).
      65             :  * @param start_col Starting column index for reading the data (default is 0).
      66             :  * @param delimiter Delimiter character used in the file (default is ',').
      67             :  * @return std::vector<data_t> A vector containing the read data.
      68             :  */
      69             : template <typename data_t>
      70         462 : auto readFile(const fs::path &name, int start_row = 0, int start_col = 0, char delimiter = ',')
      71             : {
      72         462 :   std::ifstream in(name, std::ios_base::in);
      73         462 :   if (!in.good()) // check if we could open the file
      74             :   {
      75           0 :     std::cerr << "Error in readFile. File " << name << " could not be opened.\n";
      76           0 :     std::runtime_error("");
      77             :   }
      78             : 
      79         462 :   ignoreBOM(in);
      80             : 
      81             :   // https://stackoverflow.com/questions/70497719/read-from-comma-separated-file-into-vector-of-objects
      82         462 :   std::string line{};
      83         462 :   char c = '.';
      84             : 
      85         472 :   for (int i = 0; i < start_row; i++) // Skip first start_row rows to start from start_row.
      86          10 :     std::getline(in, line);
      87             : 
      88             : 
      89             :   data_t temp, p_i;
      90         462 :   std::vector<data_t> p;
      91         462 :   p.reserve(10000);
      92             : 
      93      237294 :   while (std::getline(in, line)) {
      94      118416 :     std::istringstream iss(line);
      95             : 
      96      236832 :     for (int i = 0; i < start_col; i++) // Skip first start_col columns to start from start_col.
      97             :     {
      98      118416 :       iss >> temp;
      99      118416 :       if (delimiter != ' ' && delimiter != '\t') // These we do not need to remove from stream.
     100      118416 :         iss >> c;
     101             :     }
     102             : 
     103      118416 :     iss >> p_i; // Finally we got it!  % #TODO does not work for many-column arrays.
     104      118416 :     p.push_back(p_i);
     105             :   }
     106             : 
     107         462 :   p.shrink_to_fit();
     108         924 :   return p;
     109         462 : }
     110             : 
     111             : /**
     112             :  * @brief Loads all files from a given folder and returns their data as vectors along with file names.
     113             :  *
     114             :  * @tparam data_t The data type of the elements to be read.
     115             :  * @tparam Tpath Type of the folder path (auto-deduced).
     116             :  * @param folder_path Path of the folder containing the files.
     117             :  * @param Ndata Maximum number of data points to read from each file (default is -1, read all data).
     118             :  * @param verbose Verbosity level for logging output (default is 1).
     119             :  * @param start_row Starting row index for reading the data (default is 0).
     120             :  * @param start_col Starting column index for reading the data (default is 0).
     121             :  * @param delimiter Delimiter character used in the files (default is ',').
     122             :  * @return std::pair<std::vector<std::vector<data_t>>, std::vector<std::string>> A pair containing vectors of data and corresponding file names.
     123             :  */
     124             : template <typename data_t, typename Tpath>
     125          17 : auto load_folder(Tpath &folder_path, int Ndata = -1, int verbose = 1, int start_row = 0, int start_col = 0, char delimiter = ',')
     126             : {
     127          17 :   std::cout << "Reading data:" << std::endl;
     128             : 
     129          17 :   std::vector<std::vector<data_t>> p_vec;
     130          17 :   std::vector<std::string> p_names;
     131             : 
     132          17 :   int i_data = 0;
     133         975 :   for (const auto &entry : fs::directory_iterator(folder_path)) {
     134             : 
     135         462 :     auto p = readFile<data_t>(entry.path(), start_row, start_col, delimiter);
     136             : 
     137         462 :     if (verbose >= 2 || (verbose == 1 && p.empty()))
     138           0 :       std::cout << entry.path() << "\tSize: " << p.size() << '\n';
     139             : 
     140         462 :     p_vec.push_back(std::move(p));
     141         462 :     p_names.push_back(entry.path().stem().string());
     142             : 
     143         462 :     i_data++;
     144         462 :     if (i_data == Ndata) break;
     145             :   }
     146             : 
     147          17 :   std::cout << p_vec.size() << " time-series data are read.\n";
     148             : 
     149          34 :   return std::pair(p_vec, p_names);
     150          17 : }
     151             : 
     152             : /**
     153             :  * @brief Loads batch data from a single file and returns the data as vectors.
     154             :  *
     155             :  * @tparam data_t The data type of the elements to be read.
     156             :  * @param file_path Path of the file containing batch data.
     157             :  * @param Ndata Maximum number of data points to read (default is -1, read all data).
     158             :  * @param verbose Verbosity level for logging output (default is 1).
     159             :  * @param start_row Starting row index for reading the data (default is 0).
     160             :  * @param start_col Starting column index for reading the data (default is 0).
     161             :  * @param delimiter Delimiter character used in the file (default is ',').
     162             :  * @return std::pair<std::vector<std::vector<data_t>>, std::vectorstd::string> A pair containing vectors of data and corresponding identifiers.
     163             :  */
     164             : template <typename data_t>
     165          32 : auto load_batch_file(fs::path &file_path, int Ndata = -1, int verbose = 1, int start_row = 0, int start_col = 0, char delimiter = ',')
     166             : {
     167          32 :   std::cout << "Reading data:" << std::endl;
     168             : 
     169          32 :   std::vector<std::vector<data_t>> p_vec;
     170          32 :   std::vector<std::string> p_names;
     171             : 
     172          32 :   auto myAbsPath = fs::absolute(file_path);
     173             : 
     174          32 :   std::ifstream in(file_path, std::ios_base::in);
     175          32 :   if (!in.good()) // check if we could open the file
     176             :   {
     177           0 :     std::cerr << "Error in readFile. File " << file_path << " could not be opened.\n";
     178           0 :     throw 2;
     179             :   }
     180             : 
     181          32 :   std::string line;
     182          32 :   int n_rows{ 0 };
     183       16240 :   while ((Ndata == -1 || n_rows < Ndata) && std::getline(in, line)) //!< Read file.
     184             :   {
     185        8104 :     if (n_rows < start_row) // Skip first rows.
     186           0 :       continue;
     187             : 
     188        8104 :     n_rows++;
     189             : 
     190        8104 :     std::vector<data_t> p;
     191        8104 :     p.reserve(10000);
     192        8104 :     std::istringstream in_line(line);
     193             :     data_t temp, p_i;
     194             :     char c;
     195             : 
     196        8104 :     for (int i = 0; i < start_col; i++) // Skip first start_col columns to start from start_col.
     197             :     {
     198           0 :       in_line >> temp;
     199           0 :       if (delimiter != ' ' && delimiter != '\t') // These we do not need to remove from stream.
     200           0 :         in_line >> c;
     201             :     }
     202             : 
     203     1032347 :     while (in_line >> p_i) {
     204     1024243 :       p.push_back(p_i);
     205     1024243 :       if (delimiter != ' ' && delimiter != '\t') // These we do not need to remove from stream.
     206      525955 :         in_line >> c;
     207             :     }
     208             : 
     209        8104 :     p.shrink_to_fit();
     210             : 
     211        8104 :     if (verbose >= 2 || (verbose == 1 && p.empty()))
     212           0 :       std::cout << file_path << '\t' << "data: " << n_rows << " Size: " << p.size() << '\n';
     213             : 
     214        8104 :     p_vec.push_back(std::move(p));
     215        8104 :     p_names.push_back(std::to_string(n_rows));
     216             :   }
     217             : 
     218          32 :   std::cout << p_vec.size() << " time-series data are read.\n";
     219             : 
     220          64 :   return std::pair(p_vec, p_names);
     221          32 : }
     222             : 
     223             : /**
     224             :  * @brief Writes an Armadillo matrix to a file in CSV format.
     225             :  * @tparam data_t The data type of the elements in the matrix.
     226             :  * @param matrix The Armadillo matrix to be written to file.
     227             :  * @param path Path of the file where the matrix will be saved.
     228             :  */
     229             : template <typename data_t>
     230          65 : void writeMatrix(const arma::Mat<data_t> &matrix, const fs::path &path)
     231             : {
     232          65 :   matrix.save(path.string(), arma::csv_ascii);
     233          65 : }
     234             : 
     235             : /**
     236             :  * @brief Reads a CSV file into an Armadillo matrix.
     237             :  * @tparam data_t The data type of the elements in the matrix.
     238             :  * @param matrix Reference to an Armadillo matrix where the data will be loaded.
     239             :  * @param name Path of the CSV file to read.
     240             :  */
     241             : template <typename data_t>
     242          65 : void readMatrix(arma::Mat<data_t> &matrix, const fs::path &name)
     243             : {
     244          65 :   matrix.load(name.string(), arma::csv_ascii);
     245          65 : }
     246             : 
     247             : } // namespace dtwc

Generated by: LCOV version 1.14