// Program entry point (excerpt): parses command-line options, counts n-grams
// from the input files named on the command line, and writes the counts out
// in either tsv or sqlite form.
// NOTE(review): this listing appears to carry leading numbers fused onto the
// statements, and many intermediate lines seem to be absent; the comments
// below describe only the fragments that are visible here.
58int main(
int argc,
char* argv[])
// Names of the two output formats that `format` is compared against below.
69 const std::string TABBED_SEPARATED_VALUES =
"tsv";
70 const std::string SQLITE =
"sqlite";
// Selected output format; starts out as the tsv format.
72 std::string format = TABBED_SEPARATED_VALUES;
// Lowercase flag, initially off; where it is consumed is not visible here.
75 bool lowercase =
false;
// Short option string for getopt_long: n, o and f take an argument (trailing
// colon); a, l, h and v do not.
82 const char *
const short_options =
"n:o:f:alhv";
// Long option names, each mapped to the matching short option character.
83 const struct option long_options[] =
85 {
"ngrams", required_argument, 0,
'n' },
86 {
"output", required_argument, 0,
'o' },
87 {
"format", required_argument, 0,
'f' },
88 {
"append", no_argument, 0,
'a' },
89 {
"lowercase", no_argument, 0,
'l' },
90 {
"help", no_argument, 0,
'h' },
91 {
"version", no_argument, 0,
'v' },
// Fetch the next option; the trailing while condition further down repeats
// this until getopt_long returns -1 (no more options).
96 next_option = getopt_long(argc,
102 switch (next_option) {
// Accept the n-gram size only when the argument parses to a positive value.
// NOTE(review): atoi is evaluated twice and returns 0 when no conversion is
// possible, so malformed input is indistinguishable from 0 - strtol would
// allow proper validation.
104 if (atoi(optarg) > 0) {
105 ngrams = atoi(optarg);
// Tail of the format validation: the visible part accepts the tsv name.
115 || optarg == TABBED_SEPARATED_VALUES) {
// Report an unrecognised format name on stderr.
118 std::cerr <<
"Unknown format " << optarg << std::endl << std::endl;
// Diagnostic for an option the switch does not handle (presumably the
// default case - the label is not visible here).
145 std::cerr <<
"Error: unhandled option." << std::endl;
149 }
while (next_option != -1);
// True when no positional argument is left after option parsing (optind is
// the index of the first non-option argument); the handling is not visible.
152 if ((argc - optind < 1)) {
// Occurrence count for each distinct n-gram.
159 std::map<NgramList, int> ngramMap;
// Treat every remaining command-line argument as an input file.
161 for (
int i = optind; i < argc; i++) {
168 std::cout <<
"Parsing " << argv[i] <<
"..."
174 std::ifstream infile(argv[i]);
// Punctuation set passed as the last argument of a call whose start is not
// visible; presumably the tokenizer separator characters - TODO confirm.
177 "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<");
// Runs at most ngrams - 1 times while the tokenizer still has tokens.
// NOTE(review): this index reuses the name `i` of the file loop above; if the
// loops are nested as they appear to be, the inner one shadows the outer.
181 for (
int i = 0; (i < ngrams - 1 && tokenizer.
hasMoreTokens()); i++) {
// Append the token to the n-gram being built.
190 ngram.push_back(token);
// Increment the count for this n-gram; operator[] inserts a zero-initialised
// entry the first time a key is seen.
193 ngramMap[ngram] = ngramMap[ngram] + 1;
// Announce the output format and destination.
207 std::cout <<
"Writing out to " << format <<
" format file "
208 << output <<
"..." << std::endl;
// --- tsv output branch ---
209 if (format == TABBED_SEPARATED_VALUES) {
// File stream allocated below, and the stream previously tied to std::cout.
213 std::ofstream *outstream = 0;
214 std::ostream *prev_outstream = 0;
// NOTE(review): std::string::c_str() never returns a null pointer, so this
// condition is always true; an emptiness check on `output` was probably
// intended - confirm.
216 if (output.c_str()) {
// NOTE(review): no matching delete is visible in this excerpt - confirm the
// stream is released, or hold it by value.
218 outstream =
new std::ofstream (output.c_str(), std::ios::out);
// NOTE(review): tie() only sets the stream that gets flushed before I/O on
// std::cout; it does not redirect what is written to std::cout. If the rows
// below are meant to land in the file, they need to be written to *outstream
// (or the rdbuf swapped) - confirm the intent.
220 prev_outstream = std::cout.tie (outstream);
// Number of distinct n-grams; denominator for the progress fraction.
225 long total = ngramMap.size();
227 std::map<NgramList, int>::const_iterator it;
// One output row per n-gram: each token followed by a tab, then the count.
228 for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
229 for (NgramList::const_iterator ngram_it = it->first.begin();
230 ngram_it != it->first.end();
232 std::cout << *ngram_it <<
'\t';
// std::endl flushes std::cout after every row.
234 std::cout << it->second << std::endl;
// Report the fraction of n-grams written so far; the declaration of `count`
// is not visible here.
235 progressBar.
update(
static_cast<double>(count++)/total);
// Same always-true condition as above; puts back the previous tie.
238 if (output.c_str()) {
239 std::cout.tie (prev_outstream);
244 }
// --- sqlite output branch ---
else if (format == SQLITE) {
254 long total = ngramMap.size();
256 std::map<NgramList, int>::const_iterator it;
257 for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
// Copy the tokens of the map key into `ngram` (declared outside this view).
261 for (NgramList::const_iterator jt = it->first.begin();
262 jt != it->first.end();
264 ngram.push_back(*jt);
// Store the updated count for this n-gram.
// NOTE(review): `count` is added to the in-memory count here and is also
// post-incremented for the progress bar below; its declaration is not visible
// - confirm these are meant to be the same variable.
273 sqliteDbCntr.
updateNgram(ngram, count + it->second);
283 progressBar.
update(
static_cast<double>(count++)/total);
// Terminate the current console line and flush.
291 std::cout << std::endl;
int main(int argc, char *argv[])
std::list< std::string > NgramList