mlpack 3.4.2
load_csv.hpp
Go to the documentation of this file.
1
12#ifndef MLPACK_CORE_DATA_LOAD_CSV_HPP
13#define MLPACK_CORE_DATA_LOAD_CSV_HPP
14
15#include <boost/spirit/include/qi.hpp>
16#include <boost/algorithm/string/trim.hpp>
17
18#include <mlpack/core.hpp>
20
21#include <set>
22#include <string>
23
24#include "extension.hpp"
25#include "format.hpp"
26#include "dataset_mapper.hpp"
27
28namespace mlpack {
29namespace data {
30
37{
38 public:
// Construct the LoadCSV object on the given file.
// NOTE(review): the definition is not in this header; whether the stream is
// opened here (rather than lazily) is not visible from this file - confirm.
43 LoadCSV(const std::string& file);
44
54 template<typename T, typename PolicyType>
55 void Load(arma::Mat<T> &inout,
57 const bool transpose = true)
58 {
59 CheckOpen();
60
61 if (transpose)
62 TransposeParse(inout, infoSet);
63 else
64 NonTransposeParse(inout, infoSet);
65 }
66
77 template<typename T, typename MapPolicy>
78 void GetMatrixSize(size_t& rows, size_t& cols, DatasetMapper<MapPolicy>& info)
79 {
80 using namespace boost::spirit;
81
82 // Take a pass through the file. If the DatasetMapper policy requires it,
83 // we will pass everything string through MapString(). This might be useful
84 // if, e.g., the MapPolicy needs to find which dimensions are numeric or
85 // categorical.
86
87 // Reset to the start of the file.
88 inFile.clear();
89 inFile.seekg(0, std::ios::beg);
90 rows = 0;
91 cols = 0;
92
93 // First, count the number of rows in the file (this is the dimensionality).
94 std::string line;
95 while (std::getline(inFile, line))
96 {
97 ++rows;
98 }
99 info = DatasetMapper<MapPolicy>(rows);
100
101 // Now, jump back to the beginning of the file.
102 inFile.clear();
103 inFile.seekg(0, std::ios::beg);
104 rows = 0;
105
106 while (std::getline(inFile, line))
107 {
108 ++rows;
109 // Remove whitespace from either side.
110 boost::trim(line);
111
112 if (rows == 1)
113 {
114 // Extract the number of columns.
115 auto findColSize = [&cols](iter_type) { ++cols; };
116 qi::parse(line.begin(), line.end(),
117 stringRule[findColSize] % delimiterRule);
118 }
119
120 // I guess this is technically a second pass, but that's ok... still the
121 // same idea...
122 if (MapPolicy::NeedsFirstPass)
123 {
124 // In this case we must pass everything we parse to the MapPolicy.
125 auto firstPassMap = [&](const iter_type& iter)
126 {
127 std::string str(iter.begin(), iter.end());
128 boost::trim(str);
129
130 info.template MapFirstPass<T>(std::move(str), rows - 1);
131 };
132
133 // Now parse the line.
134 qi::parse(line.begin(), line.end(),
135 stringRule[firstPassMap] % delimiterRule);
136 }
137 }
138 }
139
150 template<typename T, typename MapPolicy>
151 void GetTransposeMatrixSize(size_t& rows,
152 size_t& cols,
154 {
155 using namespace boost::spirit;
156
157 // Take a pass through the file. If the DatasetMapper policy requires it,
158 // we will pass everything string through MapString(). This might be useful
159 // if, e.g., the MapPolicy needs to find which dimensions are numeric or
160 // categorical.
161
162 // Reset to the start of the file.
163 inFile.clear();
164 inFile.seekg(0, std::ios::beg);
165 rows = 0;
166 cols = 0;
167
168 std::string line;
169 while (std::getline(inFile, line))
170 {
171 ++cols;
172 // Remove whitespace from either side.
173 boost::trim(line);
174
175 if (cols == 1)
176 {
177 // Extract the number of dimensions.
178 auto findRowSize = [&rows](iter_type) { ++rows; };
179 qi::parse(line.begin(), line.end(),
180 stringRule[findRowSize] % delimiterRule);
181
182 // Now that we know the dimensionality, initialize the DatasetMapper.
183 info.SetDimensionality(rows);
184 }
185
186 // If we need to do a first pass for the DatasetMapper, do it.
187 if (MapPolicy::NeedsFirstPass)
188 {
189 size_t dim = 0;
190
191 // In this case we must pass everything we parse to the MapPolicy.
192 auto firstPassMap = [&](const iter_type& iter)
193 {
194 std::string str(iter.begin(), iter.end());
195 boost::trim(str);
196
197 info.template MapFirstPass<T>(std::move(str), dim++);
198 };
199
200 // Now parse the line.
201 qi::parse(line.begin(), line.end(),
202 stringRule[firstPassMap] % delimiterRule);
203 }
204 }
205 }
206
207 private:
208 using iter_type = boost::iterator_range<std::string::iterator>;
209
// Called by Load() before any parsing takes place.
// NOTE(review): defined outside this header; presumably verifies that inFile
// was opened successfully and reports an error otherwise - confirm.
214 void CheckOpen();
215
222 template<typename T, typename PolicyType>
223 void NonTransposeParse(arma::Mat<T>& inout,
225 {
226 using namespace boost::spirit;
227
228 // Get the size of the matrix.
229 size_t rows, cols;
230 GetMatrixSize<T>(rows, cols, infoSet);
231
232 // Set up output matrix.
233 inout.set_size(rows, cols);
234 size_t row = 0;
235 size_t col = 0;
236
237 // Reset file position.
238 std::string line;
239 inFile.clear();
240 inFile.seekg(0, std::ios::beg);
241
242 auto setCharClass = [&](iter_type const &iter)
243 {
244 std::string str(iter.begin(), iter.end());
245 if (str == "\t")
246 {
247 str.clear();
248 }
249 boost::trim(str);
250
251 inout(row, col++) = infoSet.template MapString<T>(std::move(str), row);
252 };
253
254 while (std::getline(inFile, line))
255 {
256 // Remove whitespace from either side.
257 boost::trim(line);
258
259 // Parse the numbers from a line (ex: 1,2,3,4); if the parser finds a
260 // number it will execute the setNum function.
261 const bool canParse = qi::parse(line.begin(), line.end(),
262 stringRule[setCharClass] % delimiterRule);
263
264 // Make sure we got the right number of rows.
265 if (col != cols)
266 {
267 std::ostringstream oss;
268 oss << "LoadCSV::NonTransposeParse(): wrong number of dimensions ("
269 << col << ") on line " << row << "; should be " << cols
270 << " dimensions.";
271 throw std::runtime_error(oss.str());
272 }
273
274 if (!canParse)
275 {
276 std::ostringstream oss;
277 oss << "LoadCSV::NonTransposeParse(): parsing error on line " << col
278 << "!";
279 throw std::runtime_error(oss.str());
280 }
281
282 ++row; col = 0;
283 }
284 }
285
292 template<typename T, typename PolicyType>
293 void TransposeParse(arma::Mat<T>& inout, DatasetMapper<PolicyType>& infoSet)
294 {
295 using namespace boost::spirit;
296
297 // Get matrix size. This also initializes infoSet correctly.
298 size_t rows, cols;
299 GetTransposeMatrixSize<T>(rows, cols, infoSet);
300
301 // Set the matrix size.
302 inout.set_size(rows, cols);
303
304 // Initialize auxiliary variables.
305 size_t row = 0;
306 size_t col = 0;
307 std::string line;
308 inFile.clear();
309 inFile.seekg(0, std::ios::beg);
310
315 auto parseString = [&](iter_type const &iter)
316 {
317 // All parsed values must be mapped.
318 std::string str(iter.begin(), iter.end());
319 boost::trim(str);
320
321 inout(row, col) = infoSet.template MapString<T>(std::move(str), row);
322 ++row;
323 };
324
325 while (std::getline(inFile, line))
326 {
327 // Remove whitespace from either side.
328 boost::trim(line);
329
330 // Reset the row we are looking at. (Remember this is transposed.)
331 row = 0;
332
333 // Now use boost::spirit to parse the characters of the line;
334 // parseString() will be called when a token is detected.
335 const bool canParse = qi::parse(line.begin(), line.end(),
336 stringRule[parseString] % delimiterRule);
337
338 // Make sure we got the right number of rows.
339 if (row != rows)
340 {
341 std::ostringstream oss;
342 oss << "LoadCSV::TransposeParse(): wrong number of dimensions (" << row
343 << ") on line " << col << "; should be " << rows << " dimensions.";
344 throw std::runtime_error(oss.str());
345 }
346
347 if (!canParse)
348 {
349 std::ostringstream oss;
350 oss << "LoadCSV::TransposeParse(): parsing error on line " << col
351 << "!";
352 throw std::runtime_error(oss.str());
353 }
354
355 // Increment the column index.
356 ++col;
357 }
358 }
359
// Spirit rule matched against each field of a line; the size and parse
// routines attach their per-token actions to it.
361 boost::spirit::qi::rule<std::string::iterator, iter_type()> stringRule;
// Spirit rule used as the separator between fields (the right-hand side of
// `stringRule[...] % delimiterRule`).
363 boost::spirit::qi::rule<std::string::iterator, iter_type()> delimiterRule;
364
// NOTE(review): not referenced anywhere in this header; presumably the
// extension of the file being loaded - confirm against the constructor.
366 std::string extension;
// NOTE(review): not referenced anywhere in this header; presumably the path
// passed to the constructor - confirm.
368 std::string filename;
// Input stream that the size and parse routines rewind and read line by line.
370 std::ifstream inFile;
371};
372
373} // namespace data
374} // namespace mlpack
375
376#endif
Auxiliary information for a dataset, including mappings to/from strings (or other types) and the data...
void SetDimensionality(const size_t dimensionality)
Set the dimensionality of an existing DatasetMapper object.
Load the csv file. This class uses boost::spirit to implement the parser; please refer to the following lin...
Definition: load_csv.hpp:37
LoadCSV(const std::string &file)
Construct the LoadCSV object on the given file.
void GetTransposeMatrixSize(size_t &rows, size_t &cols, DatasetMapper< MapPolicy > &info)
Peek at the file to determine the number of rows and columns in the matrix, assuming a transposed mat...
Definition: load_csv.hpp:151
void GetMatrixSize(size_t &rows, size_t &cols, DatasetMapper< MapPolicy > &info)
Peek at the file to determine the number of rows and columns in the matrix, assuming a non-transposed...
Definition: load_csv.hpp:78
void Load(arma::Mat< T > &inout, DatasetMapper< PolicyType > &infoSet, const bool transpose=true)
Load the file into the given matrix with the given DatasetMapper object.
Definition: load_csv.hpp:55
Include all of the base components required to write mlpack methods, and the main mlpack Doxygen docu...
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1