pcremodule.c 14.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/* Pcre objects */

#include "Python.h"

#ifndef Py_eval_input
/* For Python 1.4, graminit.h has to be explicitly included */
#include "graminit.h"
#define Py_eval_input eval_input
#endif

#ifndef FOR_PYTHON
#define FOR_PYTHON
#endif

#include "pcre.h"
16
#include "pcre-int.h"
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42

static PyObject *ErrorObject;

typedef struct {
	PyObject_HEAD
	pcre *regex;
	pcre_extra *regex_extra;
        int num_groups;
} PcreObject;

staticforward PyTypeObject Pcre_Type;

#define PcreObject_Check(v)	((v)->ob_type == &Pcre_Type)
#define NORMAL			0
#define CHARCLASS		1
#define REPLACEMENT		2

#define CHAR 			0
#define MEMORY_REFERENCE 	1
#define SYNTAX 			2
#define NOT_SYNTAX 		3
#define SET			4
#define WORD_BOUNDARY		5
#define NOT_WORD_BOUNDARY	6
#define BEGINNING_OF_BUFFER	7
#define END_OF_BUFFER		8
Guido van Rossum's avatar
Guido van Rossum committed
43
#define STRING                  9
44 45

static PcreObject *
Peter Schneider-Kamp's avatar
Peter Schneider-Kamp committed
46
newPcreObject(PyObject *args)
47 48
{
	PcreObject *self;
49
	self = PyObject_New(PcreObject, &Pcre_Type);
50 51 52 53 54 55 56 57 58 59
	if (self == NULL)
		return NULL;
	self->regex = NULL;
	self->regex_extra = NULL;
	return self;
}

/* Pcre methods */

static void
Peter Schneider-Kamp's avatar
Peter Schneider-Kamp committed
60
PyPcre_dealloc(PcreObject *self)
61
{
62 63
	if (self->regex) (pcre_free)(self->regex);
	if (self->regex_extra) (pcre_free)(self->regex_extra);
64
	PyObject_Del(self);
65 66 67 68
}


static PyObject *
Peter Schneider-Kamp's avatar
Peter Schneider-Kamp committed
69
PyPcre_exec(PcreObject *self, PyObject *args)
70
{
71 72 73
        char *string;
	int stringlen, pos = 0, options=0, endpos = -1, i, count;
	int offsets[100*2]; 
74 75
	PyObject *list;

76 77
	if (!PyArg_ParseTuple(args, "t#|iii:match", &string, &stringlen, 
                                     &pos, &endpos, &options))
78
		return NULL;
79
	if (endpos == -1) {endpos = stringlen;}
80
	count = pcre_exec(self->regex, self->regex_extra, 
81
			  string, endpos, pos, options,
82
			  offsets, sizeof(offsets)/sizeof(int) );
83 84 85 86 87 88 89 90
	/* If an error occurred during the match, and an exception was raised,
	   just return NULL and leave the exception alone.  The most likely
	   problem to cause this would be running out of memory for
	   the failure stack. */
	if (PyErr_Occurred())
	{
		return NULL;
	}
91 92
	if (count==PCRE_ERROR_NOMATCH) {Py_INCREF(Py_None); return Py_None;}
	if (count<0)
93
	{
94 95 96
		PyObject *errval = Py_BuildValue("si", "Regex execution error", count);
		PyErr_SetObject(ErrorObject, errval);
		Py_XDECREF(errval);
97 98
		return NULL;
	}
99 100 101 102
	
	list=PyList_New(self->num_groups+1);
	if (list==NULL) return NULL;
	for(i=0; i<=self->num_groups; i++)
103 104 105 106 107 108 109 110 111 112
	{
		PyObject *v;
		int start=offsets[i*2], end=offsets[i*2+1];
		/* If the group wasn't affected by the match, return -1, -1 */
		if (start<0 || count<=i) 
		{start=end=-1;}
		v=Py_BuildValue("ii", start, end);
		if (v==NULL) {Py_DECREF(list); return NULL;}
		PyList_SetItem(list, i, v);
	}
113 114 115 116 117 118 119 120 121
	return list;
}

static PyMethodDef Pcre_methods[] = {
	{"match",	(PyCFunction)PyPcre_exec,	1},
	{NULL,		NULL}		/* sentinel */
};

static PyObject *
Peter Schneider-Kamp's avatar
Peter Schneider-Kamp committed
122
PyPcre_getattr(PcreObject *self, char *name)
123 124 125 126 127 128
{
	return Py_FindMethod(Pcre_methods, (PyObject *)self, name);
}


staticforward PyTypeObject Pcre_Type = {
129
	PyObject_HEAD_INIT(NULL)
130
	0,			/*ob_size*/
131
	"pcre.Pcre",		/*tp_name*/
132 133 134 135 136
	sizeof(PcreObject),	/*tp_basicsize*/
	0,			/*tp_itemsize*/
	/* methods */
	(destructor)PyPcre_dealloc, /*tp_dealloc*/
	0,			/*tp_print*/
137
	(getattrfunc)PyPcre_getattr, /*tp_getattr*/
138 139 140 141 142 143 144 145 146 147 148
	0,                      /*tp_setattr*/
	0,			/*tp_compare*/
	0,			/*tp_repr*/
	0,			/*tp_as_number*/
	0,			/*tp_as_sequence*/
	0,			/*tp_as_mapping*/
	0,			/*tp_hash*/
};
/* --------------------------------------------------------------------- */

static PyObject *
Peter Schneider-Kamp's avatar
Peter Schneider-Kamp committed
149
PyPcre_compile(PyObject *self, PyObject *args)
150 151 152
{
	PcreObject *rv;
	PyObject *dictionary;
Guido van Rossum's avatar
Guido van Rossum committed
153
	char *pattern;
154
	const char *error;
155
	
Guido van Rossum's avatar
Guido van Rossum committed
156
	int options, erroroffset;
157
	if (!PyArg_ParseTuple(args, "siO!:pcre_compile", &pattern, &options,
158 159 160 161
			      &PyDict_Type, &dictionary))
		return NULL;
	rv = newPcreObject(args);
	if ( rv == NULL )
162
		return NULL;
163

Guido van Rossum's avatar
Guido van Rossum committed
164
	rv->regex = pcre_compile((char*)pattern, options, 
165 166
				 &error, &erroroffset, dictionary);
	if (rv->regex==NULL) 
167
	{
168
		Py_DECREF(rv);
169 170
		if (!PyErr_Occurred())
		{
171 172 173
			PyObject *errval = Py_BuildValue("si", error, erroroffset);
			PyErr_SetObject(ErrorObject, errval);
			Py_XDECREF(errval);
174 175 176
		}
		return NULL;
	}
177 178
	rv->regex_extra=pcre_study(rv->regex, 0, &error);
	if (rv->regex_extra==NULL && error!=NULL) 
179
	{
180
		PyObject *errval = Py_BuildValue("si", error, 0);
181
		Py_DECREF(rv);
182 183
		PyErr_SetObject(ErrorObject, errval);
		Py_XDECREF(errval);
184 185
		return NULL;
	}
186 187
        rv->num_groups = pcre_info(rv->regex, NULL, NULL);
	if (rv->num_groups<0) 
188
	{
189 190 191
		PyObject *errval = Py_BuildValue("si", error, rv->num_groups);
		PyErr_SetObject(ErrorObject, errval);
		Py_XDECREF(errval);
192
		Py_DECREF(rv);
193 194
		return NULL;
	}
195 196 197 198
	return (PyObject *)rv;
}

static PyObject *
Peter Schneider-Kamp's avatar
Peter Schneider-Kamp committed
199 200
PyPcre_expand_escape(unsigned char *pattern, int pattern_len,
                     int *indexptr, int *typeptr)
201
{
202 203
	unsigned char c;
	int index = *indexptr;
204
  
205
	if (pattern_len<=index)
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
	{
		PyErr_SetString(ErrorObject, "escape ends too soon");
		return NULL;
	}
	c=pattern[index]; index++;
	*typeptr=CHAR;

	switch (c)
	{
	case('t'):
		*indexptr=index;
		return Py_BuildValue("c", (char)9);
	case('n'):
		*indexptr = index;
		return Py_BuildValue("c", (char)10);
	case('v'):
		*indexptr = index;
		return Py_BuildValue("c", (char)11);
	case('r'):
		*indexptr = index;
		return Py_BuildValue("c", (char)13);
	case('f'):
		*indexptr = index;
		return Py_BuildValue("c", (char)12);
	case('a'):
		*indexptr = index;
		return Py_BuildValue("c", (char)7);
	case('b'):
		*indexptr=index;
		return Py_BuildValue("c", (char)8);
Guido van Rossum's avatar
Guido van Rossum committed
236 237 238
	case('\\'):
		*indexptr=index;
		return Py_BuildValue("c", '\\');
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265

	case('x'):
	{
		int x, ch, end;

		x = 0; end = index;
		while ( (end<pattern_len && pcre_ctypes[ pattern[end] ] & ctype_xdigit) != 0)
		{
			ch = pattern[end];
			x = x * 16 + pcre_lcc[ch] -
				(((pcre_ctypes[ch] & ctype_digit) != 0)? '0' : 'W');
			x &= 255;
			end++;
		}
		if (end==index)
		{
			PyErr_SetString(ErrorObject, "\\x must be followed by hex digits");
			return NULL;
		}
		*indexptr = end;
		return Py_BuildValue("c", (char)x);
	}

	case('E'):    case('G'):    case('L'):    case('Q'):
	case('U'):    case('l'):    case('u'):
	{
		char message[50];
266 267
		PyOS_snprintf(message, sizeof(message),
			      "\\%c is not allowed", c);
268 269 270 271 272 273 274
		PyErr_SetString(ErrorObject, message);
		return NULL;
	}

	case('g'):
	{
		int end, i;
Guido van Rossum's avatar
Guido van Rossum committed
275 276
		int group_num = 0, is_number=0;

277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
		if (pattern_len<=index)
		{
			PyErr_SetString(ErrorObject, "unfinished symbolic reference");
			return NULL;
		}
		if (pattern[index]!='<')
		{
			PyErr_SetString(ErrorObject, "missing < in symbolic reference");
			return NULL;
		}
		index++;
		end=index;
		while (end<pattern_len && pattern[end]!='>')
			end++;
		if (end==pattern_len)
		{
			PyErr_SetString(ErrorObject, "unfinished symbolic reference");
			return NULL;
		}

		if (index==end)		/* Zero-length name */
		{
			/* XXX should include the text of the reference */
			PyErr_SetString(ErrorObject, "zero-length symbolic reference");
			return NULL;
		}
Guido van Rossum's avatar
Guido van Rossum committed
303
		if ((pcre_ctypes[pattern[index]] & ctype_digit)) /* First char. a digit */
304
		{
Guido van Rossum's avatar
Guido van Rossum committed
305 306
		        is_number = 1;
			group_num = pattern[index] - '0';
307 308 309 310
		}

		for(i=index+1; i<end; i++)
		{
Guido van Rossum's avatar
Guido van Rossum committed
311 312 313 314 315 316 317 318
		        if (is_number && 
			    !(pcre_ctypes[pattern[i]] & ctype_digit) )
			{
				/* XXX should include the text of the reference */
				PyErr_SetString(ErrorObject, "illegal non-digit character in \\g<...> starting with digit");
				return NULL;			       
			}
			else {group_num = group_num * 10 + pattern[i] - '0';}
319 320 321 322 323 324 325
			if (!(pcre_ctypes[pattern[i]] & ctype_word) )
			{
				/* XXX should include the text of the reference */
				PyErr_SetString(ErrorObject, "illegal symbolic reference");
				return NULL;
			}
		}	
326
	    
327
		*typeptr = MEMORY_REFERENCE;
328
		*indexptr = end+1;
Guido van Rossum's avatar
Guido van Rossum committed
329 330 331
		/* If it's a number, return the integer value of the group */
		if (is_number) return Py_BuildValue("i", group_num);
		/* Otherwise, return a string containing the group name */
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355
		return Py_BuildValue("s#", pattern+index, end-index);
	}

	case('0'):
	{
		/* \0 always indicates an octal escape, so we consume up to 3
		   characters, as long as they're all octal digits */
		int octval=0, i;
		index--;
		for(i=index;
		    i<=index+2 && i<pattern_len 
			    && (pcre_ctypes[ pattern[i] ] & ctype_odigit );
		    i++)
		{
			octval = octval * 8 + pattern[i] - '0';
		}
		if (octval>255)
		{
			PyErr_SetString(ErrorObject, "octal value out of range");
			return NULL;
		}
		*indexptr = i;
		return Py_BuildValue("c", (unsigned char)octval);
	}
356

357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
	case('1'):    case('2'):    case('3'):    case('4'):
	case('5'):    case('6'):    case('7'):    case('8'):
	case('9'):
	{
		/* Handle \?, where ? is from 1 through 9 */
		int value=0;
		index--;
		/* If it's at least a two-digit reference, like \34, it might
		   either be a 3-digit octal escape (\123) or a 2-digit
		   decimal memory reference (\34) */

		if ( (index+1) <pattern_len && 
		     (pcre_ctypes[ pattern[index+1] ] & ctype_digit) )
		{
			if ( (index+2) <pattern_len && 
			     (pcre_ctypes[ pattern[index+2] ] & ctype_odigit) &&
			     (pcre_ctypes[ pattern[index+1] ] & ctype_odigit) &&
			     (pcre_ctypes[ pattern[index  ] ] & ctype_odigit)
				)
			{
				/* 3 octal digits */
				value= 8*8*(pattern[index  ]-'0') +
					8*(pattern[index+1]-'0') +
					(pattern[index+2]-'0');
				if (value>255)
				{
					PyErr_SetString(ErrorObject, "octal value out of range");
					return NULL;
				}
				*indexptr = index+3;
				return Py_BuildValue("c", (unsigned char)value);
			}
			else
			{
				/* 2-digit form, so it's a memory reference */
				value= 10*(pattern[index  ]-'0') +
					(pattern[index+1]-'0');
				if (value<1 || EXTRACT_MAX<=value)
				{
					PyErr_SetString(ErrorObject, "memory reference out of range");
					return NULL;
				}
				*typeptr = MEMORY_REFERENCE;
				*indexptr = index+2;
				return Py_BuildValue("i", value);
			}
		}
		else 
		{
			/* Single-digit form, like \2, so it's a memory reference */
			*typeptr = MEMORY_REFERENCE;
			*indexptr = index+1;
			return Py_BuildValue("i", pattern[index]-'0');
		}
	}

	default:
Guido van Rossum's avatar
Guido van Rossum committed
414 415 416
	  /* It's some unknown escape like \s, so return a string containing
	     \s */
		*typeptr = STRING;
417
		*indexptr = index;
Guido van Rossum's avatar
Guido van Rossum committed
418
		return Py_BuildValue("s#", pattern+index-2, 2);
419
	}
420 421 422
}

static PyObject *
Peter Schneider-Kamp's avatar
Peter Schneider-Kamp committed
423
PyPcre_expand(PyObject *self, PyObject *args)
424
{
425 426 427 428 429
	PyObject *results, *match_obj;
	PyObject *repl_obj, *newstring;
	unsigned char *repl;
	int size, total_len, i, start, pos;

430
	if (!PyArg_ParseTuple(args, "OS:pcre_expand", &match_obj, &repl_obj)) 
431 432 433 434 435 436 437 438 439 440 441 442 443 444 445
		return NULL;

	repl=(unsigned char *)PyString_AsString(repl_obj);
	size=PyString_Size(repl_obj);
	results=PyList_New(0);
	if (results==NULL) return NULL;
	for(start=total_len=i=0; i<size; i++)
	{
		if (repl[i]=='\\')
		{
			PyObject *value;
			int escape_type;

			if (start!=i)
			{
446 447 448 449 450 451 452 453 454 455 456 457 458
				int status;
				PyObject *s = PyString_FromStringAndSize(
					(char *)repl+start, i-start);
				if (s == NULL) {
					Py_DECREF(results);
					return NULL;
				}
				status = PyList_Append(results, s);
				Py_DECREF(s);
				if (status < 0) {
					Py_DECREF(results);
					return NULL;
				}
459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479
				total_len += i-start;
			}
			i++;
			value=PyPcre_expand_escape(repl, size, &i, &escape_type);
			if (value==NULL)
			{
				/* PyPcre_expand_escape triggered an exception of some sort,
				   so just return */
				Py_DECREF(results);
				return NULL;
			}
			switch (escape_type)
			{
			case (CHAR):
				PyList_Append(results, value);
				total_len += PyString_Size(value);
				break;
			case(MEMORY_REFERENCE):
			{
				PyObject *r, *tuple, *result;
				r=PyObject_GetAttrString(match_obj, "group");
480 481 482 483
				if (r == NULL) {
					Py_DECREF(results);
					return NULL;
				}
484 485 486 487 488 489 490
				tuple=PyTuple_New(1);
				Py_INCREF(value);
				PyTuple_SetItem(tuple, 0, value);
				result=PyEval_CallObject(r, tuple);
				Py_DECREF(r); Py_DECREF(tuple);
				if (result==NULL)
				{
491
					/* The group() method triggered an exception of some sort */
492 493 494 495 496 497 498
					Py_DECREF(results);
					Py_DECREF(value);
					return NULL;
				}
				if (result==Py_None)
				{
					char message[50];
499
					PyOS_snprintf(message, sizeof(message),
500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521
						"group did not contribute to the match");
					PyErr_SetString(ErrorObject, 
							message);
					Py_DECREF(result);
					Py_DECREF(value);
					Py_DECREF(results);
					return NULL;
				}
				/* typecheck that it's a string! */
				if (!PyString_Check(result))
				{
					Py_DECREF(results);
					Py_DECREF(result);
					PyErr_SetString(ErrorObject, 
							"group() must return a string value for replacement");
					return NULL;
				}
				PyList_Append(results, result);
				total_len += PyString_Size(result);
				Py_DECREF(result);
			}
			break;
Guido van Rossum's avatar
Guido van Rossum committed
522 523 524 525 526 527
			case(STRING):
			  {
			    PyList_Append(results, value);
			    total_len += PyString_Size(value);
			    break;
			  }
528 529 530 531 532 533 534 535 536 537 538 539 540
			default:
				Py_DECREF(results);
				PyErr_SetString(ErrorObject, 
						"bad escape in replacement");
				return NULL;
			}
			Py_DECREF(value);
			start=i;
			i--; /* Decrement now, because the 'for' loop will increment it */
		}
	} /* endif repl[i]!='\\' */

	if (start!=i)
541
	{
542 543 544 545 546 547 548 549 550 551 552 553 554
		int status;
		PyObject *s = PyString_FromStringAndSize((char *)repl+start, 
							 i-start);
		if (s == NULL) {
			Py_DECREF(results);
			return NULL;
		}
		status = PyList_Append(results, s);
		Py_DECREF(s);
		if (status < 0) {
			Py_DECREF(results);
			return NULL;
		}
555
		total_len += i-start;
556
	}
557 558 559 560 561

	/* Whew!  Now we've constructed a list containing various pieces of
	   strings that will make up our final result.  So, iterate over 
	   the list concatenating them.  A new string measuring total_len
	   bytes is allocated and filled in. */
562
     
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578
	newstring=PyString_FromStringAndSize(NULL, total_len);
	if (newstring==NULL)
	{
		Py_DECREF(results);
		return NULL;
	}

	repl=(unsigned char *)PyString_AsString(newstring);
	for (pos=i=0; i<PyList_Size(results); i++)
	{
		PyObject *item=PyList_GetItem(results, i);
		memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
		pos += PyString_Size(item);
	}
	Py_DECREF(results);
	return newstring;
579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596
}


/* List of functions defined in the module */

static PyMethodDef pcre_methods[] = {
	{"pcre_compile",		PyPcre_compile,		1},
	{"pcre_expand",		PyPcre_expand,		1},
	{NULL,		NULL}		/* sentinel */
};


/*
 * Convenience routine to export an integer value.
 * For simplicity, errors (which are unlikely anyway) are ignored.
 */

static void
Peter Schneider-Kamp's avatar
Peter Schneider-Kamp committed
597
insint(PyObject *d, char *name, int value)
598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
{
	PyObject *v = PyInt_FromLong((long) value);
	if (v == NULL) {
		/* Don't bother reporting this error */
		PyErr_Clear();
	}
	else {
		PyDict_SetItemString(d, name, v);
		Py_DECREF(v);
	}
}


/* Initialization function for the module (*must* be called initpcre) */

613
DL_EXPORT(void)
614
initpcre(void)
615 616 617
{
	PyObject *m, *d;

618 619
        Pcre_Type.ob_type = &PyType_Type;

620 621 622 623 624
	/* Create the module and add the functions */
	m = Py_InitModule("pcre", pcre_methods);

	/* Add some symbolic constants to the module */
	d = PyModule_GetDict(m);
625
	ErrorObject = PyErr_NewException("pcre.error", NULL, NULL);
626 627 628 629 630 631 632 633
	PyDict_SetItemString(d, "error", ErrorObject);

	/* Insert the flags */
	insint(d, "IGNORECASE", PCRE_CASELESS);
	insint(d, "ANCHORED", PCRE_ANCHORED);
	insint(d, "MULTILINE", PCRE_MULTILINE);
	insint(d, "DOTALL", PCRE_DOTALL);
	insint(d, "VERBOSE", PCRE_EXTENDED);
634
	insint(d, "LOCALE", PCRE_LOCALE);
635 636
}