mlpack 3.4.2
tf_idf_encoding_policy.hpp
Go to the documentation of this file.
1
13#ifndef MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP
14#define MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP
15
16#include <mlpack/prereqs.hpp>
19
20namespace mlpack {
21namespace data {
22
36{
37 public:
53 enum class TfTypes
54 {
55 BINARY,
59 };
60
76 const bool smoothIdf = true) :
77 tfType(tfType),
78 smoothIdf(smoothIdf)
79 { }
80
84 void Reset()
85 {
86 tokensFrequences.clear();
87 numContainingStrings.clear();
88 linesSizes.clear();
89 }
90
/**
 * Initialize the output matrix by calling
 * output.zeros(dictionarySize, datasetSize): one row per dictionary token and
 * one column per dataset line (Encode() writes to (value - 1, line)).
 *
 * @tparam MatType Matrix type providing zeros(rows, cols); presumably an
 *     Armadillo-style matrix -- confirm against callers.
 * @param output Matrix to initialize.
 * @param datasetSize Number of lines in the dataset (number of columns).
 * @param maxNumTokens Unused by this policy.
 * @param dictionarySize Number of tokens in the dictionary (number of rows).
 */
template<typename MatType>
static void InitMatrix(MatType& output,
                       const size_t datasetSize,
                       const size_t /* maxNumTokens */,
                       const size_t dictionarySize)
{
  const size_t numRows = dictionarySize;
  const size_t numCols = datasetSize;

  output.zeros(numRows, numCols);
}
111
/**
 * Initialize the output for the vector-of-vectors representation: afterwards
 * it holds datasetSize rows, each with dictionarySize value-initialized
 * (zero) elements.
 *
 * Any previous content of output is discarded. A plain resize() would only
 * append rows and leave existing rows (their lengths and stale values)
 * untouched, which Encode() would never overwrite for tokens that are absent
 * from a line.
 *
 * @tparam ElemType Type of the elements stored in the output.
 * @param output Container to initialize; indexed as output[line][token - 1].
 * @param datasetSize Number of lines in the dataset (number of rows).
 * @param maxNumTokens Unused by this policy.
 * @param dictionarySize Number of tokens in the dictionary (row length).
 */
template<typename ElemType>
static void InitMatrix(std::vector<std::vector<ElemType>>& output,
                       const size_t datasetSize,
                       const size_t /* maxNumTokens */,
                       const size_t dictionarySize)
{
  // assign() replaces every row, so a reused container is reset just like the
  // matrix overload resets its output with zeros().
  output.assign(datasetSize, std::vector<ElemType>(dictionarySize));
}
134
147 template<typename MatType>
148 void Encode(MatType& output,
149 const size_t value,
150 const size_t line,
151 const size_t /* index */)
152 {
153 const typename MatType::elem_type tf =
154 TermFrequency<typename MatType::elem_type>(
155 tokensFrequences[line][value], linesSizes[line]);
156
157 const typename MatType::elem_type idf =
158 InverseDocumentFrequency<typename MatType::elem_type>(
159 output.n_cols, numContainingStrings[value]);
160
161 output(value - 1, line) = tf * idf;
162 }
163
179 template<typename ElemType>
180 void Encode(std::vector<std::vector<ElemType>>& output,
181 const size_t value,
182 const size_t line,
183 const size_t /* index */)
184 {
185 const ElemType tf = TermFrequency<ElemType>(
186 tokensFrequences[line][value], linesSizes[line]);
187
188 const ElemType idf = InverseDocumentFrequency<ElemType>(
189 output.size(), numContainingStrings[value]);
190
191 output[line][value - 1] = tf * idf;
192 }
193
194 /*
195 * The function calculates the necessary statistics for the purpose
196 * of the tf-idf algorithm during the first pass through the dataset.
197 *
198 * @param line The line number at which the encoding is performed.
199 * @param index The token sequence number in the line.
200 * @param value The encoded token.
201 */
202 void PreprocessToken(const size_t line,
203 const size_t /* index */,
204 const size_t value)
205 {
206 if (line >= tokensFrequences.size())
207 {
208 linesSizes.resize(line + 1);
209 tokensFrequences.resize(line + 1);
210 }
211
212 tokensFrequences[line][value]++;
213
214 if (tokensFrequences[line][value] == 1)
215 numContainingStrings[value]++;
216
217 linesSizes[line]++;
218 }
219
221 const std::vector<std::unordered_map<size_t, size_t>>&
222 TokensFrequences() const { return tokensFrequences; }
224 std::vector<std::unordered_map<size_t, size_t>>& TokensFrequences()
225 {
226 return tokensFrequences;
227 }
228
230 const std::unordered_map<size_t, size_t>& NumContainingStrings() const
231 {
232 return numContainingStrings;
233 }
234
236 std::unordered_map<size_t, size_t>& NumContainingStrings()
237 {
238 return numContainingStrings;
239 }
240
242 const std::vector<size_t>& LinesSizes() const { return linesSizes; }
244 std::vector<size_t>& LinesSizes() { return linesSizes; }
245
247 TfTypes TfType() const { return tfType; }
249 TfTypes& TfType() { return tfType; }
250
252 bool SmoothIdf() const { return smoothIdf; }
254 bool& SmoothIdf() { return smoothIdf; }
255
259 template<typename Archive>
260 void serialize(Archive& ar, const unsigned int /* version */)
261 {
262 ar & BOOST_SERIALIZATION_NVP(tfType);
263 ar & BOOST_SERIALIZATION_NVP(smoothIdf);
264 }
265
266 private:
276 template<typename ValueType>
277 ValueType TermFrequency(const size_t numOccurrences,
278 const size_t numTokens)
279 {
280 switch (tfType)
281 {
282 case TfTypes::BINARY:
283 return numOccurrences > 0;
285 return numOccurrences;
287 return static_cast<ValueType>(numOccurrences) / numTokens;
289 return std::log(static_cast<ValueType>(numOccurrences)) + 1;
290 default:
291 Log::Fatal << "Incorrect term frequency type!";
292 return 0;
293 }
294 }
295
305 template<typename ValueType>
306 ValueType InverseDocumentFrequency(const size_t totalNumLines,
307 const size_t numOccurrences)
308 {
309 if (smoothIdf)
310 {
311 return std::log(static_cast<ValueType>(totalNumLines + 1) /
312 (1 + numOccurrences)) + 1.0;
313 }
314 else
315 {
316 return std::log(static_cast<ValueType>(totalNumLines) /
317 numOccurrences) + 1.0;
318 }
319 }
320
321 private:
323 std::vector<std::unordered_map<size_t, size_t>> tokensFrequences;
328 std::unordered_map<size_t, size_t> numContainingStrings;
330 std::vector<size_t> linesSizes;
332 TfTypes tfType;
334 bool smoothIdf;
335};
336
343template<typename TokenType>
346} // namespace data
347} // namespace mlpack
348
349#endif
static MLPACK_EXPORT util::PrefixedOutStream Fatal
Prints fatal messages prefixed with [FATAL], then terminates the program.
Definition: log.hpp:90
This class provides a dictionary interface for the purpose of string encoding.
The class translates a set of strings into numbers using various encoding algorithms.
Definition of the TfIdfEncodingPolicy class.
void Encode(MatType &output, const size_t value, const size_t line, const size_t)
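The function performs the TfIdf encoding algorithm, i.e. it writes the product of the token's term frequency and inverse document frequency into the output matrix.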
static void InitMatrix(std::vector< std::vector< ElemType > > &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
std::vector< size_t > & LinesSizes()
Modify the lines sizes.
void PreprocessToken(const size_t line, const size_t, const size_t value)
void Reset()
Clear the necessary internal variables.
static void InitMatrix(MatType &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
void Encode(std::vector< std::vector< ElemType > > &output, const size_t value, const size_t line, const size_t)
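The function performs the TfIdf encoding algorithm, i.e. it writes the product of the token's term frequency and inverse document frequency into the vector-of-vectors output.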
const std::unordered_map< size_t, size_t > & NumContainingStrings() const
Get the map from each token to the number of strings that contain it.
std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences()
Modify token frequencies.
const std::vector< size_t > & LinesSizes() const
Return the lines sizes.
bool SmoothIdf() const
Determine the idf algorithm type (whether it's smooth or not).
TfTypes TfType() const
Return the term frequency type.
const std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences() const
Return token frequencies.
TfTypes
Enum class used to identify the type of the term frequency statistics.
TfTypes & TfType()
Modify the term frequency type.
bool & SmoothIdf()
Modify the idf algorithm type (whether it's smooth or not).
void serialize(Archive &ar, const unsigned int)
Serialize the class to the given archive.
TfIdfEncodingPolicy(const TfTypes tfType=TfTypes::RAW_COUNT, const bool smoothIdf=true)
Construct this using the term frequency type and the inverse document frequency type.
std::unordered_map< size_t, size_t > & NumContainingStrings()
Modify the map from each token to the number of strings that contain it.
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
The core includes that mlpack expects; standard C++ includes and Armadillo.