Line data Source code
1 : /**
2 : * @file fileOperations.hpp
3 : * @brief Functions for file operations
4 : *
5 : * @details This header file declares various functions for performing file operations such as
6 : * reading and writing data to/from files. It includes functions to handle comma-separated values (CSV) files,
7 : * read data into vectors or Armadillo matrices, and save matrices to files.
8 : * It provides the functionality to ignore Byte Order Marks (BOM) in text files,
9 : * read specific rows and columns from files, and handle data from directories or batch files.
10 : *
11 : * @date 21 Jan 2022
12 : * @author Volkan Kumtepeli
13 : * @author Becky Perriment
14 : */
15 :
16 : #pragma once
17 :
18 : #include "settings.hpp" // for resultsPath
19 :
20 : #include <cassert> // for assert
21 : #include <chrono> // for filesystem
22 : #include <cstdlib> // for size_t
23 : #include <filesystem> // for operator<<, path, operator/, directory_iterator
24 : #include <iostream> // for operator<<, ifstream, basic_ostream, operator>>
25 : #include <string> // for string, getline, to_string
26 : #include <utility> // for pair
27 : #include <vector> // for vector
28 : #include <fstream>
29 : #include <string>
30 : #include <sstream>
31 : #include <stdexcept> // for std::runtime_error
32 :
33 : #include <armadillo>
34 :
35 : namespace dtwc {
36 :
37 : namespace fs = std::filesystem;
38 :
/**
 * @brief Skips a UTF-8 Byte Order Mark (BOM) at the current position of a stream.
 *
 * @details If the next three bytes of @p in are exactly EF BB BF they are consumed.
 * Otherwise the stream content is left untouched: bytes that matched only a prefix
 * of the BOM are pushed back, and no whitespace is skipped (unformatted reads are used).
 * All stream state flags are cleared before returning, so hitting end-of-file while
 * probing (e.g. on an empty stream) does not leave the stream in a failed state.
 *
 * @note Restoring a partially matched BOM relies on the stream buffer supporting
 * unget() for up to two characters.
 *
 * @param in Reference to the input stream to process.
 */
inline void ignoreBOM(std::istream &in)
{
  constexpr char bom[] = { '\xEF', '\xBB', '\xBF' };
  constexpr int bomLength = static_cast<int>(sizeof(bom));

  // Consume bytes only while they match the BOM, and never look past its third byte.
  int matched = 0;
  while (matched < bomLength && in.peek() == std::char_traits<char>::to_int_type(bom[matched])) {
    in.get();
    ++matched;
  }

  // A partial match is ordinary data, not a BOM: give the consumed bytes back.
  if (matched < bomLength) {
    for (; matched > 0; --matched) {
      in.clear(); // peek() may have set eofbit; unget() needs a usable stream.
      in.unget();
    }
  }

  in.clear(); // Clear EOF flag if end of file was reached while probing.
}
58 :
59 : /**
60 : * @brief Reads a file and returns the data as a vector of a specified type.
61 : *
62 : * @tparam data_t The data type of the elements to be read.
63 : * @param name Path of the file to read.
64 : * @param start_row Starting row index for reading the data (default is 0).
65 : * @param start_col Starting column index for reading the data (default is 0).
66 : * @param delimiter Delimiter character used in the file (default is ',').
67 : * @return std::vector<data_t> A vector containing the read data.
68 : */
69 : template <typename data_t>
70 462 : auto readFile(const fs::path &name, int start_row = 0, int start_col = 0, char delimiter = ',')
71 : {
72 462 : std::ifstream in(name, std::ios_base::in);
73 462 : if (!in.good()) // check if we could open the file
74 : {
75 0 : std::cerr << "Error in readFile. File " << name << " could not be opened.\n";
76 0 : std::runtime_error("");
77 : }
78 :
79 462 : ignoreBOM(in);
80 :
81 : // https://stackoverflow.com/questions/70497719/read-from-comma-separated-file-into-vector-of-objects
82 462 : std::string line{};
83 462 : char c = '.';
84 :
85 472 : for (int i = 0; i < start_row; i++) // Skip first start_row rows to start from start_row.
86 10 : std::getline(in, line);
87 :
88 :
89 : data_t temp, p_i;
90 462 : std::vector<data_t> p;
91 462 : p.reserve(10000);
92 :
93 237294 : while (std::getline(in, line)) {
94 118416 : std::istringstream iss(line);
95 :
96 236832 : for (int i = 0; i < start_col; i++) // Skip first start_col columns to start from start_col.
97 : {
98 118416 : iss >> temp;
99 118416 : if (delimiter != ' ' && delimiter != '\t') // These we do not need to remove from stream.
100 118416 : iss >> c;
101 : }
102 :
103 118416 : iss >> p_i; // Finally we got it! % #TODO does not work for many-column arrays.
104 118416 : p.push_back(p_i);
105 : }
106 :
107 462 : p.shrink_to_fit();
108 924 : return p;
109 462 : }
110 :
111 : /**
112 : * @brief Loads all files from a given folder and returns their data as vectors along with file names.
113 : *
114 : * @tparam data_t The data type of the elements to be read.
115 : * @tparam Tpath Type of the folder path (auto-deduced).
116 : * @param folder_path Path of the folder containing the files.
117 : * @param Ndata Maximum number of data points to read from each file (default is -1, read all data).
118 : * @param verbose Verbosity level for logging output (default is 1).
119 : * @param start_row Starting row index for reading the data (default is 0).
120 : * @param start_col Starting column index for reading the data (default is 0).
121 : * @param delimiter Delimiter character used in the files (default is ',').
122 : * @return std::pair<std::vector<std::vector<data_t>>, std::vector<std::string>> A pair containing vectors of data and corresponding file names.
123 : */
124 : template <typename data_t, typename Tpath>
125 17 : auto load_folder(Tpath &folder_path, int Ndata = -1, int verbose = 1, int start_row = 0, int start_col = 0, char delimiter = ',')
126 : {
127 17 : std::cout << "Reading data:" << std::endl;
128 :
129 17 : std::vector<std::vector<data_t>> p_vec;
130 17 : std::vector<std::string> p_names;
131 :
132 17 : int i_data = 0;
133 975 : for (const auto &entry : fs::directory_iterator(folder_path)) {
134 :
135 462 : auto p = readFile<data_t>(entry.path(), start_row, start_col, delimiter);
136 :
137 462 : if (verbose >= 2 || (verbose == 1 && p.empty()))
138 0 : std::cout << entry.path() << "\tSize: " << p.size() << '\n';
139 :
140 462 : p_vec.push_back(std::move(p));
141 462 : p_names.push_back(entry.path().stem().string());
142 :
143 462 : i_data++;
144 462 : if (i_data == Ndata) break;
145 : }
146 :
147 17 : std::cout << p_vec.size() << " time-series data are read.\n";
148 :
149 34 : return std::pair(p_vec, p_names);
150 17 : }
151 :
/**
 * @brief Loads batch data from a single file where every row is one data series.
 *
 * @details The first @p start_row lines of the file are discarded. Each following line is
 * parsed into one vector: the first @p start_col fields are skipped and all remaining
 * fields are read as @p data_t. A UTF-8 BOM at the very beginning of the file is removed.
 * Series are named "1", "2", ... in the order they are read.
 *
 * @tparam data_t The data type of the elements to be read.
 * @param file_path Path of the file containing batch data.
 * @param Ndata Maximum number of series (rows) to read (default is -1, read all rows).
 * @param verbose Verbosity level: 2 or higher prints every row, 1 prints only rows
 *                that produced no data (default is 1).
 * @param start_row Starting row index for reading the data (default is 0).
 * @param start_col Starting column index for reading the data (default is 0).
 * @param delimiter Delimiter character used in the file (default is ',').
 * @return std::pair<std::vector<std::vector<data_t>>, std::vector<std::string>> A pair containing vectors of data and corresponding identifiers.
 * @throws int The value 2 if the file cannot be opened.
 */
template <typename data_t>
auto load_batch_file(fs::path &file_path, int Ndata = -1, int verbose = 1, int start_row = 0, int start_col = 0, char delimiter = ',')
{
  std::cout << "Reading data:" << std::endl;

  std::vector<std::vector<data_t>> p_vec;
  std::vector<std::string> p_names;

  std::ifstream in(file_path, std::ios_base::in);
  if (!in.good()) // check if we could open the file
  {
    std::cerr << "Error in load_batch_file. File " << file_path << " could not be opened.\n";
    throw 2; // Kept as-is so that existing callers catching this value keep working.
  }

  // Space and tab are consumed by formatted extraction itself; any other
  // delimiter has to be removed from the stream explicitly.
  const bool explicitDelimiter = (delimiter != ' ' && delimiter != '\t');

  std::string line;
  int i_line{ 0 }; // Index of the physical line in the file (header rows included).
  int n_rows{ 0 }; // Number of data series read so far.
  while ((Ndata == -1 || n_rows < Ndata) && std::getline(in, line)) //!< Read file.
  {
    const bool isFirstLine = (i_line == 0);
    const bool isSkippedRow = (i_line < start_row);
    i_line++;

    if (isSkippedRow) // Skip first rows; counted separately from the series read.
      continue;

    if (isFirstLine && line.compare(0, 3, "\xEF\xBB\xBF") == 0)
      line.erase(0, 3); // Drop a UTF-8 BOM so the first value parses correctly.

    n_rows++;

    std::vector<data_t> p;
    p.reserve(10000);
    std::istringstream in_line(line);
    data_t skipped{};
    data_t p_i{};
    char c = '.';

    for (int i = 0; i < start_col; i++) // Skip first start_col columns to start from start_col.
    {
      in_line >> skipped;
      if (explicitDelimiter)
        in_line >> c;
    }

    while (in_line >> p_i) {
      p.push_back(p_i);
      if (explicitDelimiter)
        in_line >> c;
    }

    p.shrink_to_fit();

    if (verbose >= 2 || (verbose == 1 && p.empty()))
      std::cout << file_path << '\t' << "data: " << n_rows << " Size: " << p.size() << '\n';

    p_vec.push_back(std::move(p));
    p_names.push_back(std::to_string(n_rows));
  }

  std::cout << p_vec.size() << " time-series data are read.\n";

  // Move the results into the pair instead of deep-copying both vectors.
  return std::pair(std::move(p_vec), std::move(p_names));
}
222 :
223 : /**
224 : * @brief Writes an Armadillo matrix to a file in CSV format.
225 : * @tparam data_t The data type of the elements in the matrix.
226 : * @param matrix The Armadillo matrix to be written to file.
227 : * @param path Path of the file where the matrix will be saved.
228 : */
229 : template <typename data_t>
230 65 : void writeMatrix(const arma::Mat<data_t> &matrix, const fs::path &path)
231 : {
232 65 : matrix.save(path.string(), arma::csv_ascii);
233 65 : }
234 :
235 : /**
236 : * @brief Reads a CSV file into an Armadillo matrix.
237 : * @tparam data_t The data type of the elements in the matrix.
238 : * @param matrix Reference to an Armadillo matrix where the data will be loaded.
239 : * @param name Path of the CSV file to read.
240 : */
241 : template <typename data_t>
242 65 : void readMatrix(arma::Mat<data_t> &matrix, const fs::path &name)
243 : {
244 65 : matrix.load(name.string(), arma::csv_ascii);
245 65 : }
246 :
247 : } // namespace dtwc
|