diff src/regex.c @ 185:3d6bfa290dbd r20-3b19

Import from CVS: tag r20-3b19
author cvs
date Mon, 13 Aug 2007 09:55:28 +0200
parents e121b013d1f0
children 850242ba4a81
line wrap: on
line diff
--- a/src/regex.c	Mon Aug 13 09:54:24 2007 +0200
+++ b/src/regex.c	Mon Aug 13 09:55:28 2007 +0200
@@ -188,7 +188,7 @@
    if (done)
      return;
 
-   bzero (re_syntax_table, sizeof re_syntax_table);
+   memset (re_syntax_table, 0, sizeof (re_syntax_table));
 
    for (c = 'a'; c <= 'z'; c++)
      re_syntax_table[c] = Sword;
@@ -248,7 +248,7 @@
 #define ISASCII(c) (((unsigned EMACS_INT) (c)) < 0x100 && ISASCII_1 (c))
 #else
 #define ISASCII(c) ISASCII_1 (c)
-#endif
+#endif /* MULE */
 
 #ifdef isblank
 #define ISBLANK(c) (ISASCII (c) && isblank (c))
@@ -550,7 +550,7 @@
      2.3 code to enable some language specific processing */
   ,categoryspec,     /* Matches entries in the character category tables */
   notcategoryspec    /* The opposite of the above */
-#endif
+#endif /* MULE */
 
 } re_opcode_t;
 
@@ -1806,7 +1806,7 @@
 					     char *translate,
 					     reg_syntax_t syntax,
 					     Lisp_Object rtab);
-#endif
+#endif /* MULE */
 static boolean group_match_null_string_p (unsigned char **p,
 					  unsigned char *end,
 					  register_info_type *reg_info);
@@ -2058,32 +2058,26 @@
             }
 
           {
-            /* Are we optimizing this jump?  */
-            boolean keep_string_p = false;
-
-            /* 1 means zero (many) matches is allowed.  */
-            char zero_times_ok = 0, many_times_ok = 0;
+	    /* true means zero/many matches are allowed. */
+	    boolean zero_times_ok = c != '+';
+            boolean many_times_ok = c != '?';
+
+            /* true means match shortest string possible. */
+            boolean minimal = false;
 
             /* If there is a sequence of repetition chars, collapse it
                down to just one (the right one).  We can't combine
                interval operators with these because of, e.g., `a{2}*',
-               which should only match an even number of `a's.  */
-
-            for (;;)
+               which should only match an even number of `a's.	*/
+            while (p != pend)
               {
-                zero_times_ok |= c != '+';
-                many_times_ok |= c != '?';
-
-                if (p == pend)
-                  break;
-
                 PATFETCH (c);
 
-                if (c == '*'
-                    || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
+                if (c == '*' || (!(syntax & RE_BK_PLUS_QM)
+                                 && (c == '+' || c == '?')))
                   ;
 
-                else if (syntax & RE_BK_PLUS_QM  &&  c == '\\')
+                else if (syntax & RE_BK_PLUS_QM && c == '\\')
                   {
                     if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
 
@@ -2104,73 +2098,137 @@
                   }
 
                 /* If we get here, we found another repeat character.  */
-               }
+                if (!(syntax & RE_NO_MINIMAL_MATCHING))
+                  {
+                    /* `*?' and `+?' and `??' are okay (and mean match
+                       minimally), but other sequences (such as `*??' and
+                       `+++') are rejected (reserved for future use). */
+                    if (minimal || c != '?')
+                      FREE_STACK_RETURN (REG_BADRPT);
+                    minimal = true;
+                  }
+                else
+                  {
+                    zero_times_ok |= c != '+';
+                    many_times_ok |= c != '?';
+                  }
+              }
 
             /* Star, etc. applied to an empty pattern is equivalent
                to an empty pattern.  */
             if (!laststart)
               break;
 
-            /* Now we know whether or not zero matches is allowed
-               and also whether or not two or more matches is allowed.  */
-            if (many_times_ok)
-              { /* More than one repetition is allowed, so put in at the
-                   end a backward relative jump from `b' to before the next
-                   jump we're going to put in below (which jumps from
-                   laststart to after this jump).
-
-                   But if we are at the `*' in the exact sequence `.*\n',
-                   insert an unconditional jump backwards to the .,
-                   instead of the beginning of the loop.  This way we only
-                   push a failure point once, instead of every time
-                   through the loop.  */
-                assert (p - 1 > pattern);
-
-                /* Allocate the space for the jump.  */
-                GET_BUFFER_SPACE (3);
-
-                /* We know we are not at the first character of the pattern,
-                   because laststart was nonzero.  And we've already
-                   incremented `p', by the way, to be the character after
-                   the `*'.  Do we have to do something analogous here
-                   for null bytes, because of RE_DOT_NOT_NULL?  */
-                if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
-		    && zero_times_ok
-                    && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
-                    && !(syntax & RE_DOT_NEWLINE))
-                  { /* We have .*\n.  */
-                    STORE_JUMP (jump, b, laststart);
-                    keep_string_p = true;
+	    /* Now we know whether zero matches is allowed
+	       and whether two or more matches is allowed
+               and whether we want minimal or maximal matching. */
+            if (minimal)
+              {
+                if (!many_times_ok)
+                  {
+                    /* "a??" becomes:
+                       0: /on_failure_jump to 6
+                       3: /jump to 9
+                       6: /exactn/1/A
+                       9: end of pattern.
+                     */
+                    GET_BUFFER_SPACE (6);
+                    INSERT_JUMP (jump, laststart, b + 3);
+                    b += 3;
+                    INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
+                    b += 3;
+                  }
+                else if (zero_times_ok)
+                  {
+                    /* "a*?" becomes:
+                       0: /jump to 6
+                       3: /exactn/1/A
+                       6: /on_failure_jump to 3
+                       9: end of pattern.
+                     */
+                    GET_BUFFER_SPACE (6);
+                    INSERT_JUMP (jump, laststart, b + 3);
+                    b += 3;
+                    STORE_JUMP (on_failure_jump, b, laststart + 3);
+                    b += 3;
                   }
                 else
-                  /* Anything else.  */
-                  STORE_JUMP (maybe_pop_jump, b, laststart - 3);
-
-                /* We've added more stuff to the buffer.  */
-                b += 3;
+                  {
+                    /* "a+?" becomes:
+                       0: /exactn/1/A
+                       3: /on_failure_jump to 0
+                       6: end of pattern.
+                     */
+                    GET_BUFFER_SPACE (3);
+                    STORE_JUMP (on_failure_jump, b, laststart);
+                    b += 3;
+                  }
               }
-
-            /* On failure, jump from laststart to b + 3, which will be the
-               end of the buffer after this jump is inserted.  */
-            GET_BUFFER_SPACE (3);
-            INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
-                                       : on_failure_jump,
-                         laststart, b + 3);
+            else
+              {
+                /* Are we optimizing this jump?  */
+                boolean keep_string_p = false;
+
+                if (many_times_ok)
+                  { /* More than one repetition is allowed, so put in at the
+                       end a backward relative jump from `b' to before the next
+                       jump we're going to put in below (which jumps from
+                       laststart to after this jump).
+
+                       But if we are at the `*' in the exact sequence `.*\n',
+                       insert an unconditional jump backwards to the .,
+                       instead of the beginning of the loop.  This way we only
+                       push a failure point once, instead of every time
+                       through the loop.  */
+                    assert (p - 1 > pattern);
+
+                    /* Allocate the space for the jump.  */
+                    GET_BUFFER_SPACE (3);
+
+                    /* We know we are not at the first character of the
+                       pattern, because laststart was nonzero.  And we've
+                       already incremented `p', by the way, to be the
+                       character after the `*'.  Do we have to do something
+                       analogous here for null bytes, because of
+                       RE_DOT_NOT_NULL? */
+                    if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
+                        && zero_times_ok
+                        && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
+                        && !(syntax & RE_DOT_NEWLINE))
+                      { /* We have .*\n.  */
+                        STORE_JUMP (jump, b, laststart);
+                        keep_string_p = true;
+                      }
+                    else
+                      /* Anything else.  */
+                      STORE_JUMP (maybe_pop_jump, b, laststart - 3);
+
+                    /* We've added more stuff to the buffer.  */
+                    b += 3;
+                  }
+
+                /* On failure, jump from laststart to b + 3, which will be the
+                   end of the buffer after this jump is inserted.  */
+                GET_BUFFER_SPACE (3);
+                INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
+					   : on_failure_jump,
+                             laststart, b + 3);
+                b += 3;
+
+                if (!zero_times_ok)
+                  {
+                    /* At least one repetition is required, so insert a
+                       `dummy_failure_jump' before the initial
+                       `on_failure_jump' instruction of the loop. This
+                       effects a skip over that instruction the first time
+                       we hit that loop.  */
+                    GET_BUFFER_SPACE (3);
+                    INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
+                    b += 3;
+                  }
+              }
             pending_exact = 0;
-            b += 3;
-
-            if (!zero_times_ok)
-              {
-                /* At least one repetition is required, so insert a
-                   `dummy_failure_jump' before the initial
-                   `on_failure_jump' instruction of the loop. This
-                   effects a skip over that instruction the first time
-                   we hit that loop.  */
-                GET_BUFFER_SPACE (3);
-                INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
-                b += 3;
-              }
-            }
+          }
 	  break;
 
 
@@ -2502,47 +2560,72 @@
                 goto normal_backslash;
 
             handle_open:
-              bufp->re_nsub++;
-              regnum++;
-
-              if (COMPILE_STACK_FULL)
-                {
-                  RETALLOC (compile_stack.stack, compile_stack.size << 1,
-                            compile_stack_elt_t);
-                  if (compile_stack.stack == NULL) return REG_ESPACE;
-
-                  compile_stack.size <<= 1;
-                }
-
-              /* These are the values to restore when we hit end of this
-                 group.  They are all relative offsets, so that if the
-                 whole pattern moves because of realloc, they will still
-                 be valid.  */
-              COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
-              COMPILE_STACK_TOP.fixup_alt_jump
-                = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
-              COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
-              COMPILE_STACK_TOP.regnum = regnum;
-
-              /* We will eventually replace the 0 with the number of
-                 groups inner to this one.  But do not push a
-                 start_memory for groups beyond the last one we can
-                 represent in the compiled pattern.  */
-              if (regnum <= MAX_REGNUM)
-                {
-                  COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
-                  BUF_PUSH_3 (start_memory, regnum, 0);
-                }
-
-              compile_stack.avail++;
-
-              fixup_alt_jump = 0;
-              laststart = 0;
-              begalt = b;
-	      /* If we've reached MAX_REGNUM groups, then this open
-		 won't actually generate any code, so we'll have to
-		 clear pending_exact explicitly.  */
-	      pending_exact = 0;
+              {
+                regnum_t r;
+
+                if (!(syntax & RE_NO_SHY_GROUPS)
+                    && p != pend
+                    && TRANSLATE(*p) == TRANSLATE('?'))
+                  {
+                    p++;
+                    PATFETCH(c);
+                    switch (c)
+                      {
+                      case ':': /* shy groups */
+                        r = MAX_REGNUM + 1;
+                        break;
+
+                      /* All others are reserved for future constructs. */
+                      default:
+                        FREE_STACK_RETURN (REG_BADPAT);
+                      }
+                  }
+                else
+                  {
+                    bufp->re_nsub++;
+                    r = ++regnum;
+                  }
+
+                if (COMPILE_STACK_FULL)
+                  {
+                    RETALLOC (compile_stack.stack, compile_stack.size << 1,
+                              compile_stack_elt_t);
+                    if (compile_stack.stack == NULL) return REG_ESPACE;
+
+                    compile_stack.size <<= 1;
+                  }
+
+                /* These are the values to restore when we hit end of this
+                   group.  They are all relative offsets, so that if the
+                   whole pattern moves because of realloc, they will still
+                   be valid.  */
+                COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
+                COMPILE_STACK_TOP.fixup_alt_jump
+                  = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
+                COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
+                COMPILE_STACK_TOP.regnum = r;
+
+                /* We will eventually replace the 0 with the number of
+                   groups inner to this one.  But do not push a
+                   start_memory for groups beyond the last one we can
+                   represent in the compiled pattern.  */
+                if (r <= MAX_REGNUM)
+                  {
+                    COMPILE_STACK_TOP.inner_group_offset
+                      = b - bufp->buffer + 2;
+                    BUF_PUSH_3 (start_memory, r, 0);
+                  }
+
+                compile_stack.avail++;
+
+                fixup_alt_jump = 0;
+                laststart = 0;
+                begalt = b;
+                /* If we've reached MAX_REGNUM groups, then this open
+                   won't actually generate any code, so we'll have to
+                   clear pending_exact explicitly.  */
+                pending_exact = 0;
+              }
               break;
 
 
@@ -3408,10 +3491,10 @@
 	  /* And all extended characters must be allowed, too. */
 	  for (j = 0x80; j < 0xA0; j++)
 	    fastmap[j] = 1;
-#else
+#else /* ! MULE */
 	  for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
             fastmap[j] = 1;
-#endif
+#endif /* ! MULE */
 
 	  for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
 	    if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
@@ -4299,6 +4382,13 @@
   unsigned num_regs_pushed = 0;
 #endif
 
+  /* 1 if this match ends in the same string (string1 or string2)
+     as the best previous match.  */
+  boolean same_str_p;
+
+  /* 1 if this match is the best seen so far.  */
+  boolean best_match_p;
+
   DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
 
   INIT_FAIL_STACK ();
@@ -4311,14 +4401,14 @@
      array indexing.  We should fix this.  */
   if (bufp->re_nsub)
     {
-      regstart = REGEX_TALLOC (num_regs, CONST char *);
-      regend = REGEX_TALLOC (num_regs, CONST char *);
-      old_regstart = REGEX_TALLOC (num_regs, CONST char *);
-      old_regend = REGEX_TALLOC (num_regs, CONST char *);
-      best_regstart = REGEX_TALLOC (num_regs, CONST char *);
-      best_regend = REGEX_TALLOC (num_regs, CONST char *);
-      reg_info = REGEX_TALLOC (num_regs, register_info_type);
-      reg_dummy = REGEX_TALLOC (num_regs, CONST char *);
+      regstart       = REGEX_TALLOC (num_regs, CONST char *);
+      regend         = REGEX_TALLOC (num_regs, CONST char *);
+      old_regstart   = REGEX_TALLOC (num_regs, CONST char *);
+      old_regend     = REGEX_TALLOC (num_regs, CONST char *);
+      best_regstart  = REGEX_TALLOC (num_regs, CONST char *);
+      best_regend    = REGEX_TALLOC (num_regs, CONST char *);
+      reg_info       = REGEX_TALLOC (num_regs, register_info_type);
+      reg_dummy      = REGEX_TALLOC (num_regs, CONST char *);
       reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
 
       if (!(regstart && regend && old_regstart && old_regend && reg_info
@@ -4421,12 +4511,8 @@
              longest match, try backtracking.  */
           if (d != end_match_2)
 	    {
-	      /* 1 if this match ends in the same string (string1 or string2)
-		 as the best previous match.  */
-	      boolean same_str_p = (FIRST_STRING_P (match_end)
-				    == MATCHING_IN_FIRST_STRING);
-	      /* 1 if this match is the best seen so far.  */
-	      boolean best_match_p;
+	      same_str_p = (FIRST_STRING_P (match_end)
+			    == MATCHING_IN_FIRST_STRING);
 
 	      /* AIX compiler got confused when this was combined
 		 with the previous declaration.  */
@@ -4687,7 +4773,7 @@
 	    INC_CHARPTR (d);
 	    break;
 	  }
-#endif
+#endif /* MULE */
 
 
         /* The beginning of a group is represented by start_memory.
@@ -5016,7 +5102,7 @@
           EXTRACT_NUMBER_AND_INCR (mcnt, p);
           DEBUG_PRINT3 (" %d (to 0x%p):\n", mcnt, p + mcnt);
 
-          PUSH_FAILURE_POINT (p + mcnt, (void *) 0, -2);
+          PUSH_FAILURE_POINT (p + mcnt, (char *) 0, -2);
           break;
 
 
@@ -5272,7 +5358,7 @@
           DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
           /* It doesn't matter what we push for the string here.  What
              the code at `fail' tests is the value for the pattern.  */
-          PUSH_FAILURE_POINT ((void *) 0, (void *) 0, -2);
+          PUSH_FAILURE_POINT ((unsigned char *) 0, (char *) 0, -2);
           goto unconditional_jump;
 
 
@@ -5285,7 +5371,7 @@
           DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
           /* See comments just above at `dummy_failure_jump' about the
              two zeroes.  */
-          PUSH_FAILURE_POINT ((void *) 0, (void *) 0, -2);
+          PUSH_FAILURE_POINT ((unsigned char *) 0, (char *) 0, -2);
           break;
 
         /* Have to succeed matching what follows at least n times.