blocxx
PerlRegEx.cpp
Go to the documentation of this file.
1/*******************************************************************************
2* Copyright (C) 2005 Novell, Inc. All rights reserved.
3*
4* Redistribution and use in source and binary forms, with or without
5* modification, are permitted provided that the following conditions are met:
6*
7* - Redistributions of source code must retain the above copyright notice,
8* this list of conditions and the following disclaimer.
9*
10* - Redistributions in binary form must reproduce the above copyright notice,
11* this list of conditions and the following disclaimer in the documentation
12* and/or other materials provided with the distribution.
13*
14* - Neither the name of Vintela, Inc., Novell, Inc., nor the names of its
15* contributors may be used to endorse or promote products derived from this
16* software without specific prior written permission.
17*
18* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
19* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21* ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc., Novell, Inc., OR THE
22* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*******************************************************************************/
33
34#include "blocxx/PerlRegEx.hpp"
35
36#ifdef BLOCXX_HAVE_PCRE
37#ifdef BLOCXX_HAVE_PCRE_H
38
#include "blocxx/Assertion.hpp"
#include "blocxx/Format.hpp"
#include <climits> // for INT_MAX
#include <vector>
43
44
45namespace BLOCXX_NAMESPACE
46{
47
48
49// -------------------------------------------------------------------
50static String
51substitute_caps(const PerlRegEx::MatchArray &sub,
52 const String &str, const String &rep)
53{
54 static const char *cap_refs[] = {
55 NULL, "\\1", "\\2", "\\3", "\\4",
56 "\\5", "\\6", "\\7", "\\8", "\\9", NULL
57 };
58
59 String res( rep);
60 size_t pos;
61
62 for(size_t i=1; cap_refs[i] != NULL; i++)
63 {
64 String cap;
65 if( i < sub.size() && sub[i].rm_so >= 0 && sub[i].rm_eo >= 0)
66 {
67 cap = str.substring(sub[i].rm_so, sub[i].rm_eo
68 - sub[i].rm_so);
69 }
70
71 pos = res.indexOf(cap_refs[i]);
72 while( pos != String::npos)
73 {
74 size_t quotes = 0;
75 size_t at = pos;
76
77 while( at > 0 && res.charAt(--at) == '\\')
78 quotes++;
79
80 if( quotes % 2)
81 {
82 quotes = (quotes + 1) / 2;
83
84 res = res.erase(pos - quotes, quotes);
85
86 pos = res.indexOf(cap_refs[i],
87 pos + 2 - quotes);
88 }
89 else
90 {
91 quotes = quotes / 2;
92
93 res = res.substring(0, pos - quotes) +
94 cap +
95 res.substring(pos + 2);
96
97 pos = res.indexOf(cap_refs[i],
98 pos + cap.length() - quotes);
99 }
100 }
101 }
102 return res;
103}
104
105
106// -------------------------------------------------------------------
107static inline String
108getError(const int errcode)
109{
110 const char *ptr;
111 switch(errcode)
112 {
113 case 0:
114 ptr = "match vector to small";
115 break;
116
117 case PCRE_ERROR_NOMATCH:
118 ptr = "match failed";
119 break;
120
121 case PCRE_ERROR_NULL:
122 ptr = "invalid argument";
123 break;
124
125 case PCRE_ERROR_BADOPTION:
126 ptr = "unrecognized option";
127 break;
128
129 case PCRE_ERROR_BADMAGIC:
130 ptr = "invalid magic number";
131 break;
132
133 case PCRE_ERROR_UNKNOWN_NODE:
134 ptr = "unknown item in the compiled pattern";
135 break;
136
137 case PCRE_ERROR_NOMEMORY:
138 ptr = "failed to allocate memory";
139 break;
140
141 case PCRE_ERROR_NOSUBSTRING:
142 // .*_substring.* functions only
143 ptr = "failed to retrieve substring";
144 break;
145
146 case PCRE_ERROR_MATCHLIMIT:
147 // match_limit in pcre_extra struct
148 ptr = "recursion or backtracking limit reached";
149 break;
150
151 case PCRE_ERROR_CALLOUT:
152 // reserved for pcrecallout functions
153 ptr = "callout failure";
154 break;
155
156 case PCRE_ERROR_BADUTF8:
157 ptr = "invalid UTF-8 byte sequence found";
158 break;
159
160 case PCRE_ERROR_BADUTF8_OFFSET:
161 ptr = "not a UTF-8 character at specified index";
162 break;
163
164 case PCRE_ERROR_PARTIAL:
165 ptr = "partial match";
166 break;
167
168 case PCRE_ERROR_BADPARTIAL:
169 ptr = "pattern item not supported for partial matching";
170 break;
171
172 case PCRE_ERROR_INTERNAL:
173 ptr = "unexpected internal error occurred";
174 break;
175
176 case PCRE_ERROR_BADCOUNT:
177 ptr = "invalid (negative) match vector count";
178 break;
179
180 default:
181 ptr = "unknown error code";
182 break;
183 }
184 return String(ptr);
185}
186
187// -------------------------------------------------------------------
188PerlRegEx::PerlRegEx()
189 : m_pcre(NULL)
190 , m_flags(0)
191 , m_ecode(0)
192{
193}
194
195
196// -------------------------------------------------------------------
197PerlRegEx::PerlRegEx(const String &regex, int cflags)
198 : m_pcre(NULL)
199 , m_flags(0)
200 , m_ecode(0)
201{
202 if( !compile(regex, cflags))
203 {
204 BLOCXX_THROW_ERR(RegExCompileException,
205 errorString().c_str(), m_ecode);
206 }
207}
208
209
210// -------------------------------------------------------------------
211PerlRegEx::PerlRegEx(const PerlRegEx &ref)
212 : m_pcre(NULL)
213 , m_flags(ref.m_flags)
214 , m_ecode(0)
215 , m_rxstr(ref.m_rxstr)
216{
217 if( ref.m_pcre != NULL && !compile(ref.m_rxstr, ref.m_flags))
218 {
219 BLOCXX_THROW_ERR(RegExCompileException,
220 errorString().c_str(), m_ecode);
221 }
222}
223
224// -------------------------------------------------------------------
225PerlRegEx::~PerlRegEx()
226{
227 if( m_pcre)
228 {
229 free(m_pcre);
230 m_pcre = NULL;
231 }
232}
233
234
235// -------------------------------------------------------------------
236PerlRegEx &
237PerlRegEx::operator = (const PerlRegEx &ref)
238{
239 if( ref.m_pcre == NULL)
240 {
241 m_ecode = 0;
242 m_error.erase();
243 m_flags = ref.m_flags;
244 m_rxstr = ref.m_rxstr;
245 if( m_pcre != NULL)
246 {
247 free(m_pcre);
248 m_pcre = NULL;
249 }
250 }
251 else if( !compile(ref.m_rxstr, ref.m_flags))
252 {
253 BLOCXX_THROW_ERR(RegExCompileException,
254 errorString().c_str(), m_ecode);
255 }
256 return *this;
257}
258
259
260// -------------------------------------------------------------------
261bool
262PerlRegEx::compile(const String &regex, int cflags)
263{
264 if( m_pcre)
265 {
266 free(m_pcre);
267 m_pcre = NULL;
268 }
269
270 const char *errptr = NULL;
271
272 m_ecode = 0;
273 m_pcre = ::pcre_compile(regex.c_str(), cflags,
274 &errptr, &m_ecode, NULL);
275 if( m_pcre == NULL)
276 {
277 m_error = String(errptr ? errptr : "");
278 m_rxstr.erase();
279 m_flags = 0;
280 return false;
281 }
282 else
283 {
284 m_error.erase();
285 m_rxstr = regex;
286 m_flags = cflags;
287 return true;
288 }
289}
290
291
292// -------------------------------------------------------------------
293int
294PerlRegEx::errorCode()
295{
296 return m_ecode;
297}
298
299
300// -------------------------------------------------------------------
301String
302PerlRegEx::errorString() const
303{
304 return m_error;
305}
306
307
308// -------------------------------------------------------------------
309String
310PerlRegEx::patternString() const
311{
312 return m_rxstr;
313}
314
315
316// -------------------------------------------------------------------
317int
318PerlRegEx::compileFlags() const
319{
320 return m_flags;
321}
322
323
324// -------------------------------------------------------------------
325bool
326PerlRegEx::isCompiled() const
327{
328 return (m_pcre != NULL);
329}
330
331
332// -------------------------------------------------------------------
333bool
334PerlRegEx::execute(MatchArray &sub, const String &str,
335 size_t index, size_t count, int eflags)
336{
337 if( m_pcre == NULL)
338 {
339 BLOCXX_THROW(RegExCompileException,
340 "Regular expression is not compiled");
341 }
342 if( count >= size_t(INT_MAX / 3))
343 {
344 BLOCXX_THROW(AssertionException,
345 "Match count limit exceeded");
346 }
347
348 if( index > str.length())
349 {
350 BLOCXX_THROW(OutOfBoundsException,
351 Format("String index out of bounds ("
352 "length = %1, index = %2).",
353 str.length(), index
354 ).c_str());
355 }
356
357 if( count == 0)
358 {
359 int cnt = 0;
360 int ret = ::pcre_fullinfo(m_pcre, NULL,
361 PCRE_INFO_CAPTURECOUNT, &cnt);
362 if( ret)
363 {
364 m_error = getError(m_ecode);
365 return false;
366 }
367 count = cnt > 0 ? cnt + 1 : 1;
368 }
369 int vsub[count * 3];
370
371 sub.clear();
372 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
373 index, eflags, vsub, count * 3);
374 //
375 // pcre_exec returns 0 if vector too small, negative value
376 // on errors or the number of matches (number of int pairs)
377 //
378 if( m_ecode > 0)
379 {
380 sub.resize(count); // as specified by user
381 for(size_t i = 0, n = 0; i < count; i++, n += 2)
382 {
383 match_t m = { vsub[n], vsub[n+1] };
384
385 // if user wants more than detected
386 if( i >= (size_t)m_ecode)
387 m.rm_so = m.rm_eo = -1;
388
389 sub[i] = m;
390 }
391 m_error.erase();
392 return true;
393 }
394 else
395 {
396 m_error = getError(m_ecode);
397 return false;
398 }
399}
400
401
402// -------------------------------------------------------------------
403bool
404PerlRegEx::execute(MatchVector &sub, const String &str,
405 size_t index, size_t count, int eflags)
406{
407 if( m_pcre == NULL)
408 {
409 BLOCXX_THROW(RegExCompileException,
410 "Regular expression is not compiled");
411 }
412 if( count >= size_t(INT_MAX / 3))
413 {
414 BLOCXX_THROW(AssertionException,
415 "Match count limit exceeded");
416 }
417
418 if( index > str.length())
419 {
420 BLOCXX_THROW(OutOfBoundsException,
421 Format("String index out of bounds ("
422 "length = %1, index = %2)",
423 str.length(), index
424 ).c_str());
425 }
426
427 if( count == 0)
428 {
429 int cnt = 0;
430 int ret = ::pcre_fullinfo(m_pcre, NULL,
431 PCRE_INFO_CAPTURECOUNT, &cnt);
432 if( ret)
433 {
434 m_error = getError(m_ecode);
435 return false;
436 }
437 count = cnt > 0 ? cnt + 1 : 1;
438 }
439 int vsub[count * 3];
440
441 sub.clear();
442 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(), str.length(),
443 index, eflags, vsub, count * 3);
444 //
445 // pcre_exec returns 0 if vector too small, negative value
446 // on errors or the number of matches (number of int pairs)
447 //
448 if( m_ecode > 0)
449 {
450 count *= 2;
451 m_ecode *= 2;
452 sub.resize(count); // as specified by user
453 for(size_t i = 0; i < count; i++)
454 {
455 // if user wants more than detected
456 if( i >= (size_t)m_ecode)
457 vsub[i] = -1;
458
459 sub[i] = vsub[i];
460 }
461 return true;
462 }
463 else
464 {
465 m_error = getError(m_ecode);
466 return false;
467 }
468}
469
470
471// -------------------------------------------------------------------
473PerlRegEx::capture(const String &str, size_t index, size_t count, int eflags)
474{
475 if( m_pcre == NULL)
476 {
477 BLOCXX_THROW(RegExCompileException,
478 "Regular expression is not compiled");
479 }
480
481 MatchArray rsub;
482 StringArray ssub;
483
484 bool match = execute(rsub, str, index, count, eflags);
485 if( match)
486 {
487 if( rsub.empty())
488 {
489 BLOCXX_THROW(RegExCompileException,
490 "Non-capturing regular expression");
491 }
492
493 MatchArray::const_iterator i=rsub.begin();
494 for( ; i != rsub.end(); ++i)
495 {
496 if( i->rm_so >= 0 && i->rm_eo >= 0)
497 {
498 ssub.push_back(str.substring(i->rm_so,
499 i->rm_eo - i->rm_so));
500 }
501 else
502 {
503 ssub.push_back(String(""));
504 }
505 }
506 }
507 else if(m_ecode != PCRE_ERROR_NOMATCH)
508 {
509 BLOCXX_THROW_ERR(RegExExecuteException,
510 errorString().c_str(), m_ecode);
511 }
512 return ssub;
513}
514
515
516// -------------------------------------------------------------------
517blocxx::String
518PerlRegEx::replace(const String &str, const String &rep,
519 bool global, int eflags)
520{
521 if( m_pcre == NULL)
522 {
523 BLOCXX_THROW(RegExCompileException,
524 "Regular expression is not compiled");
525 }
526
527 MatchArray rsub;
528 bool match;
529 size_t off = 0;
530 String out = str;
531
532 do
533 {
534 match = execute(rsub, out, off, 0, eflags);
535 if( match)
536 {
537 if( rsub.empty() ||
538 rsub[0].rm_so < 0 ||
539 rsub[0].rm_eo < 0)
540 {
541 // only if empty (missused as guard).
542 BLOCXX_THROW(RegExCompileException,
543 "Non-capturing regular expression");
544 }
545
546 String res = substitute_caps(rsub, out, rep);
547
548 out = out.substring(0, rsub[0].rm_so) +
549 res + out.substring(rsub[0].rm_eo);
550
551 off = rsub[0].rm_so + res.length();
552 }
553 else if(m_ecode == PCRE_ERROR_NOMATCH)
554 {
555 m_ecode = 0;
556 m_error.erase();
557 }
558 else
559 {
560 BLOCXX_THROW_ERR(RegExExecuteException,
561 errorString().c_str(), m_ecode);
562 }
563 } while(global && match && out.length() > off);
564
565 return out;
566}
567
568
569// -------------------------------------------------------------------
571PerlRegEx::split(const String &str, bool empty, int eflags)
572{
573 if( m_pcre == NULL)
574 {
575 BLOCXX_THROW(RegExCompileException,
576 "Regular expression is not compiled");
577 }
578
579 MatchArray rsub;
580 StringArray ssub;
581 bool match;
582 size_t off = 0;
583 size_t len = str.length();
584
585 do
586 {
587 match = execute(rsub, str, off, 0, eflags);
588 if( match)
589 {
590 if( rsub.empty() ||
591 rsub[0].rm_so < 0 ||
592 rsub[0].rm_eo < 0)
593 {
594 BLOCXX_THROW(RegExCompileException,
595 "Non-capturing regular expression");
596 }
597
598 if( empty || ((size_t)rsub[0].rm_so > off))
599 {
600 ssub.push_back(str.substring(off,
601 rsub[0].rm_so - off));
602 }
603 off = rsub[0].rm_eo;
604 }
605 else if(m_ecode == PCRE_ERROR_NOMATCH)
606 {
607 String tmp = str.substring(off);
608 if( empty || !tmp.empty())
609 {
610 ssub.push_back(tmp);
611 }
612 m_ecode = 0;
613 m_error.erase();
614 }
615 else
616 {
617 BLOCXX_THROW_ERR(RegExExecuteException,
618 errorString().c_str(), m_ecode);
619 }
620 } while(match && len > off);
621
622 return ssub;
623}
624
625
626// -------------------------------------------------------------------
628PerlRegEx::grep(const StringArray &src, int eflags)
629{
630 if( m_pcre == NULL)
631 {
632 BLOCXX_THROW(RegExCompileException,
633 "Regular expression is not compiled");
634 }
635
636 m_ecode = 0;
637 m_error.erase();
638
639 StringArray out;
640 if( !src.empty())
641 {
642 StringArray::const_iterator i=src.begin();
643 for( ; i != src.end(); ++i)
644 {
645 int ret = ::pcre_exec(m_pcre, NULL, i->c_str(),
646 i->length(), 0, eflags, NULL, 0);
647 if( ret >= 0)
648 {
649 out.push_back(*i);
650 }
651 else if( ret != PCRE_ERROR_NOMATCH)
652 {
653 m_ecode = ret;
654 m_error = getError(m_ecode);
655 BLOCXX_THROW_ERR(RegExExecuteException,
656 errorString().c_str(), m_ecode);
657 }
658 }
659 }
660 return out;
661}
662
663
664// -------------------------------------------------------------------
665bool
666PerlRegEx::match(const String &str, size_t index, int eflags) const
667{
668 if( m_pcre == NULL)
669 {
670 BLOCXX_THROW(RegExCompileException,
671 "Regular expression is not compiled");
672 }
673
674 if( index > str.length())
675 {
676 BLOCXX_THROW(OutOfBoundsException,
677 Format("String index out of bounds."
678 "length = %1, index = %2",
679 str.length(), index
680 ).c_str());
681 }
682
683 m_ecode = ::pcre_exec(m_pcre, NULL, str.c_str(),
684 str.length(), 0, eflags, NULL, 0);
685 if( m_ecode >= 0)
686 {
687 m_error.erase();
688 return true;
689 }
690 else if( m_ecode == PCRE_ERROR_NOMATCH)
691 {
692 m_error = getError(m_ecode);
693 return false;
694 }
695 else
696 {
697 m_error = getError(m_ecode);
698 BLOCXX_THROW_ERR(RegExExecuteException,
699 errorString().c_str(), m_ecode);
700 }
701}
702
703
704// -------------------------------------------------------------------
705} // namespace BLOCXX_NAMESPACE
706
707#endif // BLOCXX_HAVE_PCRE_H
708#endif // BLOCXX_HAVE_PCRE
709
710/* vim: set ts=8 sts=8 sw=8 ai noet: */
711
#define BLOCXX_THROW(exType, msg)
Throw an exception using FILE and LINE.
#define BLOCXX_THROW_ERR(exType, msg, err)
Throw an exception using FILE and LINE.
iterator erase(iterator position)
Remove an element of the Array specified with an iterator.
This String class is an abstract data type that represents a NULL-terminated string of characters.
Definition String.hpp:67
size_t indexOf(char ch, size_t fromIndex=0) const
Find the first occurrence of a given character in this String object.
Definition String.cpp:556
static const size_t npos
Definition String.hpp:742
Taken from RFC 1321.
Array< String > StringArray
Definition CommonFwd.hpp:73