blocxx
PerlRegEx.cpp
Go to the documentation of this file.
1/*******************************************************************************
2* Copyright (C) 2005 Novell, Inc. All rights reserved.
3*
4* Redistribution and use in source and binary forms, with or without
5* modification, are permitted provided that the following conditions are met:
6*
7* - Redistributions of source code must retain the above copyright notice,
8* this list of conditions and the following disclaimer.
9*
10* - Redistributions in binary form must reproduce the above copyright notice,
11* this list of conditions and the following disclaimer in the documentation
12* and/or other materials provided with the distribution.
13*
14* - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its
15* contributors may be used to endorse or promote products derived from this
16* software without specific prior written permission.
17*
18* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
19* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21* ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE
22* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*******************************************************************************/
34#include "blocxx/PerlRegEx.hpp"
35
36#ifdef BLOCXX_HAVE_PCRE
37#ifdef BLOCXX_HAVE_PCRE_H
38
#include "blocxx/Assertion.hpp"
#include "blocxx/Format.hpp"
#include <climits> // for INT_MAX
#include <vector>
43
44
45namespace BLOCXX_NAMESPACE
46{
47
48
49// -------------------------------------------------------------------
50static String
51substitute_caps(const PerlRegEx::MatchArray &sub,
52 const String &str, const String &rep)
53{
54 static const char *cap_refs[] = {
55 NULL, "\\1", "\\2", "\\3", "\\4",
56 "\\5", "\\6", "\\7", "\\8", "\\9", NULL
57 };
58
59 String res( rep);
60 size_t pos;
61
62 for(size_t i=1; cap_refs[i] != NULL; i++)
63 {
64 String cap;
65 if( i < sub.size() && sub[i].rm_so >= 0 && sub[i].rm_eo >= 0)
66 {
67 cap = str.substring(sub[i].rm_so, sub[i].rm_eo
68 - sub[i].rm_so);
69 }
70
71 pos = res.indexOf(cap_refs[i]);
72 while( pos != String::npos)
73 {
74 size_t quotes = 0;
75 size_t at = pos;
76
77 while( at > 0 && res.charAt(--at) == '\\')
78 quotes++;
79
80 if( quotes % 2)
81 {
82 quotes = (quotes + 1) / 2;
83
84 res = res.erase(pos - quotes, quotes);
85
86 pos = res.indexOf(cap_refs[i],
87 pos + 2 - quotes);
88 }
89 else
90 {
91 quotes = quotes / 2;
92
93 res = res.substring(0, pos - quotes) +
94 cap +
95 res.substring(pos + 2);
96
97 pos = res.indexOf(cap_refs[i],
98 pos + cap.length() - quotes);
99 }
100 }
101 }
102 return res;
103}
104
105
106// -------------------------------------------------------------------
107static inline String
108getError(const int errcode)
109{
110 const char *ptr;
111 switch(errcode)
112 {
113 case 0:
114 ptr = "match vector to small";
115 break;
116
117 case PCRE_ERROR_NOMATCH:
118 ptr = "match failed";
119 break;
120
121 case PCRE_ERROR_NULL:
122 ptr = "invalid argument";
123 break;
124
125 case PCRE_ERROR_BADOPTION:
126 ptr = "unrecognized option";
127 break;
128
129 case PCRE_ERROR_BADMAGIC:
130 ptr = "invalid magic number";
131 break;
132
133 case PCRE_ERROR_UNKNOWN_NODE:
134 ptr = "unknown item in the compiled pattern";
135 break;
136
137 case PCRE_ERROR_NOMEMORY:
138 ptr = "failed to allocate memory";
139 break;
140
141 case PCRE_ERROR_NOSUBSTRING:
142 // .*_substring.* functions only
143 ptr = "failed to retrieve substring";
144 break;
145
146 case PCRE_ERROR_MATCHLIMIT:
147 // match_limit in pcre_extra struct
148 ptr = "recursion or backtracking limit reached";
149 break;
150
151 case PCRE_ERROR_CALLOUT:
152 // reserved for pcrecallout functions
153 ptr = "callout failure";
154 break;
155
156 case PCRE_ERROR_BADUTF8:
157 ptr = "invalid UTF-8 byte sequence found";
158 break;
159
160 case PCRE_ERROR_BADUTF8_OFFSET:
161 ptr = "not a UTF-8 character at specified index";
162 break;
163
164 case PCRE_ERROR_PARTIAL:
165 ptr = "partial match";
166 break;
167
168 case PCRE_ERROR_BADPARTIAL:
169 ptr = "pattern item not supported for partial matching";
170 break;
171
172 case PCRE_ERROR_INTERNAL:
173 ptr = "unexpected internal error occurred";
174 break;
175
176 case PCRE_ERROR_BADCOUNT:
177 ptr = "invalid (negative) match vector count";
178 break;
179
180 default:
181 ptr = "unknown error code";
182 break;
183 }
184 return String(ptr);
185}
186
187// -------------------------------------------------------------------
188PerlRegEx::PerlRegEx()
189 : m_pcre(NULL)
190 , m_flags(0)
191 , m_ecode(0)
192{
193}
194
195
196// -------------------------------------------------------------------
197PerlRegEx::PerlRegEx(const String &regex, int cflags)
198 : m_pcre(NULL)
199 , m_flags(0)
200 , m_ecode(0)
201{
202 if( !compile(regex, cflags))
203 {
204 BLOCXX_THROW_ERR(RegExCompileException,
205 errorString().c_str(), m_ecode);
206 }
207}
208
209
210// -------------------------------------------------------------------
211PerlRegEx::PerlRegEx(const PerlRegEx &ref)
212 : m_pcre(NULL)
213 , m_flags(ref.m_flags)
214 , m_ecode(0)
215 , m_rxstr(ref.m_rxstr)
216{
217 if( ref.m_pcre != NULL && !compile(ref.m_rxstr, ref.m_flags))
218 {
219 BLOCXX_THROW_ERR(RegExCompileException,
220 errorString().c_str(), m_ecode);
221 }
222}
223
224// -------------------------------------------------------------------
225PerlRegEx::~PerlRegEx()
226{
227 if( m_pcre)
228 {
229 free(m_pcre);
230 m_pcre = NULL;
231 }
232}
233
234
235// -------------------------------------------------------------------
236PerlRegEx &
237PerlRegEx::operator = (const PerlRegEx &ref)
238{
239 if( ref.m_pcre == NULL)
240 {
241 m_ecode = 0;
242 m_error.erase();
243 m_flags = ref.m_flags;
244 m_rxstr = ref.m_rxstr;
245 if( m_pcre != NULL)
246 {
247 free(m_pcre);
248 m_pcre = NULL;
249 }
250 }
251 else if( !compile(ref.m_rxstr, ref.m_flags))
252 {
253 BLOCXX_THROW_ERR(RegExCompileException,
254 errorString().c_str(), m_ecode);
255 }
256 return *this;
257}
258
259
260// -------------------------------------------------------------------
261bool
262PerlRegEx::compile(const String &regex, int cflags)
263{
264 if( m_pcre)
265 {
266 free(m_pcre);
267 m_pcre = NULL;
268 }
269
270 const char *errptr = NULL;
271
272 m_ecode = 0;
273 m_pcre = ::pcre_compile(regex.c_str(), cflags,
274 &errptr, &m_ecode, NULL);
275 if( m_pcre == NULL)
276 {
277 m_error = String(errptr ? errptr : "");
278 m_rxstr.erase();
279 m_flags = 0;
280 return false;
281 }
282 else
283 {
284 m_error.erase();
285 m_rxstr = regex;
286 m_flags = cflags;
287 return true;
288 }
289}
290
291
292// -------------------------------------------------------------------
293int
294PerlRegEx::errorCode()
295{
296 return m_ecode;
297}
298
299
300// -------------------------------------------------------------------
301String
302PerlRegEx::errorString() const
303{
304 return m_error;
305}
306
307
308// -------------------------------------------------------------------
309String
310PerlRegEx::patternString() const
311{
312 return m_rxstr;
313}
314
315
316// -------------------------------------------------------------------
317int
318PerlRegEx::compileFlags() const
319{
320 return m_flags;
321}
322
323
324// -------------------------------------------------------------------
325bool
326PerlRegEx::isCompiled() const
327{
328 return (m_pcre != NULL);
329}
330
331
332// -------------------------------------------------------------------
333bool
334PerlRegEx::execute(MatchArray &sub, const String &str,
335 size_t index, size_t count, int eflags)
336{
337 if( m_pcre == NULL)
338 {
339 BLOCXX_THROW(RegExCompileException,
340 "Regular expression is not compiled");
341 }
342 if( count >= size_t(INT_MAX / 3))
343 {
344 BLOCXX_THROW(AssertionException,
345 "Match count limit exceeded");
346 }
347
348 if( index > str.length())
349 {
350 BLOCXX_THROW(OutOfBoundsException,
351 Format("String index out of bounds ("
352 "length = %1, index = %2).",
353 str.length(), index
354 ).c_str());
355 }
356
357 if( count == 0)
358 {
359 int cnt = 0;
360 int ret = ::pcre_fullinfo(m_pcre, NULL,
361 PCRE_INFO_CAPTURECOUNT, &cnt);
362 if( ret)
363 {
364 m_error = getError(m_ecode);
365 return false;
366 }
367 count = cnt > 0 ? cnt + 1 : 1;
368 }
369 int vsub[count * 3];
370
371 sub.clear();
372 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
373 index, eflags, vsub, count * 3);
374 //
375 // pcre_exec returns 0 if vector too small, negative value
376 // on errors or the number of matches (number of int pairs)
377 //
378 if( m_ecode > 0)
379 {
380 sub.resize(count); // as specified by user
381 for(size_t i = 0, n = 0; i < count; i++, n += 2)
382 {
383 match_t m = { vsub[n], vsub[n+1] };
384
385 // if user wants more than detected
386 if( i >= (size_t)m_ecode)
387 m.rm_so = m.rm_eo = -1;
388
389 sub[i] = m;
390 }
391 m_error.erase();
392 return true;
393 }
394 else
395 {
396 m_error = getError(m_ecode);
397 return false;
398 }
399}
400
401
402// -------------------------------------------------------------------
403bool
404PerlRegEx::execute(MatchVector &sub, const String &str,
405 size_t index, size_t count, int eflags)
406{
407 if( m_pcre == NULL)
408 {
409 BLOCXX_THROW(RegExCompileException,
410 "Regular expression is not compiled");
411 }
412 if( count >= size_t(INT_MAX / 3))
413 {
414 BLOCXX_THROW(AssertionException,
415 "Match count limit exceeded");
416 }
417
418 if( index > str.length())
419 {
420 BLOCXX_THROW(OutOfBoundsException,
421 Format("String index out of bounds ("
422 "length = %1, index = %2)",
423 str.length(), index
424 ).c_str());
425 }
426
427 if( count == 0)
428 {
429 int cnt = 0;
430 int ret = ::pcre_fullinfo(m_pcre, NULL,
431 PCRE_INFO_CAPTURECOUNT, &cnt);
432 if( ret)
433 {
434 m_error = getError(m_ecode);
435 return false;
436 }
437 count = cnt > 0 ? cnt + 1 : 1;
438 }
439 int vsub[count * 3];
440
441 sub.clear();
442 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
443 index, eflags, vsub, count * 3);
444 //
445 // pcre_exec returns 0 if vector too small, negative value
446 // on errors or the number of matches (number of int pairs)
447 //
448 if( m_ecode > 0)
449 {
450 count *= 2;
451 m_ecode *= 2;
452 sub.resize(count); // as specified by user
453 for(size_t i = 0; i < count; i++)
454 {
455 // if user wants more than detected
456 if( i >= (size_t)m_ecode)
457 vsub[i] = -1;
458
459 sub[i] = vsub[i];
460 }
461 return true;
462 }
463 else
464 {
465 m_error = getError(m_ecode);
466 return false;
467 }
468}
469
470
471// -------------------------------------------------------------------
473PerlRegEx::capture(const String &str, size_t index, size_t count, int eflags)
474{
475 if( m_pcre == NULL)
476 {
477 BLOCXX_THROW(RegExCompileException,
478 "Regular expression is not compiled");
479 }
480
481 MatchArray rsub;
482 StringArray ssub;
483
484 bool match = execute(rsub, str, index, count, eflags);
485 if( match)
486 {
487 if( rsub.empty())
488 {
489 BLOCXX_THROW(RegExCompileException,
490 "Non-capturing regular expression");
491 }
492
493 MatchArray::const_iterator i=rsub.begin();
494 for( ; i != rsub.end(); ++i)
495 {
496 if( i->rm_so >= 0 && i->rm_eo >= 0)
497 {
498 ssub.push_back(str.substring(i->rm_so,
499 i->rm_eo - i->rm_so));
500 }
501 else
502 {
503 ssub.push_back(String(""));
504 }
505 }
506 }
507 else if(m_ecode != PCRE_ERROR_NOMATCH)
508 {
509 BLOCXX_THROW_ERR(RegExExecuteException,
510 errorString().c_str(), m_ecode);
511 }
512 return ssub;
513}
514
515
516// -------------------------------------------------------------------
517blocxx::String
518PerlRegEx::replace(const String &str, const String &rep,
519 bool global, int eflags)
520{
521 if( m_pcre == NULL)
522 {
523 BLOCXX_THROW(RegExCompileException,
524 "Regular expression is not compiled");
525 }
526
527 MatchArray rsub;
528 bool match;
529 size_t off = 0;
530 String out = str;
531
532 do
533 {
534 match = execute(rsub, out, off, 0, eflags);
535 if( match)
536 {
537 if( rsub.empty() ||
538 rsub[0].rm_so < 0 ||
539 rsub[0].rm_eo < 0)
540 {
541 // only if empty (missused as guard).
542 BLOCXX_THROW(RegExCompileException,
543 "Non-capturing regular expression");
544 }
545
546 String res = substitute_caps(rsub, out, rep);
547
548 out = out.substring(0, rsub[0].rm_so) +
549 res + out.substring(rsub[0].rm_eo);
550
551 off = rsub[0].rm_so + res.length();
552 }
553 else if(m_ecode == PCRE_ERROR_NOMATCH)
554 {
555 m_ecode = 0;
556 m_error.erase();
557 }
558 else
559 {
560 BLOCXX_THROW_ERR(RegExExecuteException,
561 errorString().c_str(), m_ecode);
562 }
563 } while(global && match && out.length() > off);
564
565 return out;
566}
567
568
569// -------------------------------------------------------------------
571PerlRegEx::split(const String &str, bool empty, int eflags)
572{
573 if( m_pcre == NULL)
574 {
575 BLOCXX_THROW(RegExCompileException,
576 "Regular expression is not compiled");
577 }
578
579 MatchArray rsub;
580 StringArray ssub;
581 bool match;
582 size_t off = 0;
583 size_t len = str.length();
584
585 do
586 {
587 match = execute(rsub, str, off, 0, eflags);
588 if( match)
589 {
590 if( rsub.empty() ||
591 rsub[0].rm_so < 0 ||
592 rsub[0].rm_eo < 0)
593 {
594 BLOCXX_THROW(RegExCompileException,
595 "Non-capturing regular expression");
596 }
597
598 if( empty || ((size_t)rsub[0].rm_so > off))
599 {
600 ssub.push_back(str.substring(off,
601 rsub[0].rm_so - off));
602 }
603 off = rsub[0].rm_eo;
604 }
605 else if(m_ecode == PCRE_ERROR_NOMATCH)
606 {
607 String tmp = str.substring(off);
608 if( empty || !tmp.empty())
609 {
610 ssub.push_back(tmp);
611 }
612 m_ecode = 0;
613 m_error.erase();
614 }
615 else
616 {
617 BLOCXX_THROW_ERR(RegExExecuteException,
618 errorString().c_str(), m_ecode);
619 }
620 } while(match && len > off);
621
622 return ssub;
623}
624
625
626// -------------------------------------------------------------------
628PerlRegEx::grep(const StringArray &src, int eflags)
629{
630 if( m_pcre == NULL)
631 {
632 BLOCXX_THROW(RegExCompileException,
633 "Regular expression is not compiled");
634 }
635
636 m_ecode = 0;
637 m_error.erase();
638
639 StringArray out;
640 if( !src.empty())
641 {
642 StringArray::const_iterator i=src.begin();
643 for( ; i != src.end(); ++i)
644 {
645 int ret = ::pcre_exec(m_pcre, NULL, i->c_str(),
646 i->length(), 0, eflags, NULL, 0);
647 if( ret >= 0)
648 {
649 out.push_back(*i);
650 }
651 else if( ret != PCRE_ERROR_NOMATCH)
652 {
653 m_ecode = ret;
654 m_error = getError(m_ecode);
655 BLOCXX_THROW_ERR(RegExExecuteException,
656 errorString().c_str(), m_ecode);
657 }
658 }
659 }
660 return out;
661}
662
663
664// -------------------------------------------------------------------
665bool
666PerlRegEx::match(const String &str, size_t index, int eflags) const
667{
668 if( m_pcre == NULL)
669 {
670 BLOCXX_THROW(RegExCompileException,
671 "Regular expression is not compiled");
672 }
673
674 if( index > str.length())
675 {
676 BLOCXX_THROW(OutOfBoundsException,
677 Format("String index out of bounds."
678 "length = %1, index = %2",
679 str.length(), index
680 ).c_str());
681 }
682
683 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(),
684 str.length(), 0, eflags, NULL, 0);
685 if( m_ecode >= 0)
686 {
687 m_error.erase();
688 return true;
689 }
690 else if( m_ecode == PCRE_ERROR_NOMATCH)
691 {
692 m_error = getError(m_ecode);
693 return false;
694 }
695 else
696 {
697 m_error = getError(m_ecode);
698 BLOCXX_THROW_ERR(RegExExecuteException,
699 errorString().c_str(), m_ecode);
700 }
701}
702
703
704// -------------------------------------------------------------------
705} // namespace BLOCXX_NAMESPACE
706
707#endif // BLOCXX_HAVE_PCRE_H
708#endif // BLOCXX_HAVE_PCRE
709
710/* vim: set ts=8 sts=8 sw=8 ai noet: */
711
#define BLOCXX_THROW(exType, msg)
Throw an exception using FILE and LINE.
#define BLOCXX_THROW_ERR(exType, msg, err)
Throw an exception using FILE and LINE.
iterator erase(iterator position)
Remove an element of the Array specified with an iterator.
static const size_t npos
Definition String.hpp:742
Taken from RFC 1321.
Array< String > StringArray
Definition CommonFwd.hpp:73