github的一些开源项目
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

402 lines
11 KiB

  1. #! /bin/sh
  2. # Script for testing regular expressions with perl to check that PCRE2 handles
  3. # them the same. For testing with different versions of Perl, if the first
  4. # argument is -perl then the second is taken as the Perl command to use, and
  5. # both are then removed. If the next argument is "-w", Perl is called with
  6. # "-w", which turns on its warning mode.
  7. #
  8. # The Perl code has to have "use utf8" and "require Encode" at the start when
  9. # running UTF-8 tests, but *not* for non-utf8 tests. (The "require" would
  10. # actually be OK for non-utf8-tests, but is not always installed, so this way
  11. # the script will always run for these tests.)
  12. #
  13. # The desired effect is achieved by making this a shell script that passes the
  14. # Perl script to Perl through a pipe. If the next argument is "-utf8", a
  15. # suitable prefix is set up.
  16. #
  17. # The remaining arguments, if any, are passed to Perl. They are an input file
  18. # and an output file. If there is one argument, the output is written to
  19. # STDOUT. If Perl receives no arguments, it opens /dev/tty as input, and writes
  20. # output to STDOUT. (I haven't found a way of getting it to use STDIN, because
  21. # of the contorted piping input.)
  22. perl=perl
  23. perlarg=''
  24. prefix=''
  25. if [ $# -gt 1 -a "$1" = "-perl" ] ; then
  26. shift
  27. perl=$1
  28. shift
  29. fi
  30. if [ $# -gt 0 -a "$1" = "-w" ] ; then
  31. perlarg="-w"
  32. shift
  33. fi
  34. if [ $# -gt 0 -a "$1" = "-utf8" ] ; then
  35. prefix="use utf8; require Encode;"
  36. shift
  37. fi
  38. # The Perl script that follows has a similar specification to pcre2test, and so
  39. # can be given identical input, except that input patterns can be followed only
  40. # by Perl's lower case modifiers and certain other pcre2test modifiers that are
  41. # either handled or ignored:
  42. #
  43. # aftertext interpreted as "print $' afterwards"
  44. # afteralltext ignored
  45. # dupnames ignored (Perl always allows)
  46. # jitstack ignored
  47. # mark show mark information
  48. # no_auto_possess ignored
  49. # no_start_optimize insert (??{""}) at pattern start (disables optimizing)
  50. # -no_start_optimize ignored
  51. # subject_literal does not process subjects for escapes
  52. # ucp sets Perl's /u modifier
  53. # utf invoke UTF-8 functionality
  54. #
  55. # Comment lines are ignored. The #pattern command can be used to set modifiers
  56. # that will be added to each subsequent pattern, after any modifiers it may
  57. # already have. NOTE: this is different to pcre2test where #pattern sets
  58. # defaults which can be overridden on individual patterns. The #subject command
  59. # may be used to set or unset a default "mark" modifier for data lines. This is
  60. # the only use of #subject that is supported. The #perltest, #forbid_utf, and
  61. # #newline_default commands, which are needed in the relevant pcre2test files,
  62. # are ignored. Any other #-command is ignored, with a warning message.
  63. #
  64. # The pattern lines should use only / as the delimiter. The other characters
  65. # that pcre2test supports cause problems with this script.
  66. #
  67. # The data lines must not have any pcre2test modifiers. Unless
  68. # "subject_literal" is on the pattern, data lines are processed as
  69. # Perl double-quoted strings, so if they contain " $ or @ characters, these
  70. # have to be escaped. For this reason, all such characters in the
  71. # Perl-compatible testinput1 and testinput4 files are escaped so that they can
  72. # be used for perltest as well as for pcre2test. The output from this script
  73. # should be same as from pcre2test, apart from the initial identifying banner.
  74. #
  75. # The other testinput files are not suitable for feeding to perltest.sh,
  76. # because they make use of the special modifiers that pcre2test uses for
  77. # testing features of PCRE2. Some of these files also contain malformed regular
  78. # expressions, in order to check that PCRE2 diagnoses them correctly.
  79. (echo "$prefix" ; cat <<'PERLEND'
  80. # The alpha assertions currently give warnings even when -w is not specified.
  81. no warnings "experimental::alpha_assertions";
  82. no warnings "experimental::script_run";
  83. # Function for turning a string into a string of printing chars.
  84. sub pchars {
  85. my($t) = "";
  86. if ($utf8)
  87. {
  88. @p = unpack('U*', $_[0]);
  89. foreach $c (@p)
  90. {
  91. if ($c >= 32 && $c < 127) { $t .= chr $c; }
  92. else { $t .= sprintf("\\x{%02x}", $c);
  93. }
  94. }
  95. }
  96. else
  97. {
  98. foreach $c (split(//, $_[0]))
  99. {
  100. if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
  101. else { $t .= sprintf("\\x%02x", ord $c); }
  102. }
  103. }
  104. $t;
  105. }
  106. # Read lines from a named file or stdin and write to a named file or stdout;
  107. # lines consist of a regular expression, in delimiters and optionally followed
  108. # by options, followed by a set of test data, terminated by an empty line.
  109. # Sort out the input and output files
  110. if (@ARGV > 0)
  111. {
  112. open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n";
  113. $infile = "INFILE";
  114. $interact = 0;
  115. }
  116. else
  117. {
  118. open(INFILE, "</dev/tty") || die "Failed to open /dev/tty\n";
  119. $infile = "INFILE";
  120. $interact = 1;
  121. }
  122. if (@ARGV > 1)
  123. {
  124. open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n";
  125. $outfile = "OUTFILE";
  126. }
  127. else { $outfile = "STDOUT"; }
  128. printf($outfile "Perl $^V\n\n");
  129. $extra_modifiers = "";
  130. $default_show_mark = 0;
  131. # Main loop
  132. NEXT_RE:
  133. for (;;)
  134. {
  135. printf " re> " if $interact;
  136. last if ! ($_ = <$infile>);
  137. printf $outfile "$_" if ! $interact;
  138. next if ($_ =~ /^\s*$/ || $_ =~ /^#[\s!]/);
  139. # A few of pcre2test's #-commands are supported, or just ignored. Any others
  140. # cause an error.
  141. if ($_ =~ /^#pattern(.*)/)
  142. {
  143. $extra_modifiers = $1;
  144. chomp($extra_modifiers);
  145. $extra_modifiers =~ s/\s+$//;
  146. next;
  147. }
  148. elsif ($_ =~ /^#subject(.*)/)
  149. {
  150. $mod = $1;
  151. chomp($mod);
  152. $mod =~ s/\s+$//;
  153. if ($mod =~ s/(-?)mark,?//)
  154. {
  155. $minus = $1;
  156. $default_show_mark = ($minus =~ /^$/);
  157. }
  158. if ($mod !~ /^\s*$/)
  159. {
  160. printf $outfile "** Warning: \"$mod\" in #subject ignored\n";
  161. }
  162. next;
  163. }
  164. elsif ($_ =~ /^#/)
  165. {
  166. if ($_ !~ /^#newline_default|^#perltest|^#forbid_utf/)
  167. {
  168. printf $outfile "** Warning: #-command ignored: %s", $_;
  169. }
  170. next;
  171. }
  172. $pattern = $_;
  173. while ($pattern !~ /^\s*(.).*\1/s)
  174. {
  175. printf " > " if $interact;
  176. last if ! ($_ = <$infile>);
  177. printf $outfile "$_" if ! $interact;
  178. $pattern .= $_;
  179. }
  180. chomp($pattern);
  181. $pattern =~ s/\s+$//;
  182. # Split the pattern from the modifiers and adjust them as necessary.
  183. $pattern =~ /^\s*((.).*\2)(.*)$/s;
  184. $pat = $1;
  185. $del = $2;
  186. $mod = "$3,$extra_modifiers";
  187. $mod =~ s/^,\s*//;
  188. # The private "aftertext" modifier means "print $' afterwards".
  189. $showrest = ($mod =~ s/aftertext,?//);
  190. # The "subject_literal" modifier disables escapes in subjects.
  191. $subject_literal = ($mod =~ s/subject_literal,?//);
  192. # "allaftertext" is used by pcre2test to print remainders after captures
  193. $mod =~ s/allaftertext,?//;
  194. # Detect utf
  195. $utf8 = $mod =~ s/utf,?//;
  196. # Remove "dupnames".
  197. $mod =~ s/dupnames,?//;
  198. # Remove "jitstack".
  199. $mod =~ s/jitstack=\d+,?//;
  200. # The "mark" modifier requests checking of MARK data */
  201. $show_mark = $default_show_mark | ($mod =~ s/mark,?//);
  202. # "ucp" asks pcre2test to set PCRE2_UCP; change this to /u for Perl
  203. $mod =~ s/ucp,?/u/;
  204. # Remove "no_auto_possess".
  205. $mod =~ s/no_auto_possess,?//;
  206. # Use no_start_optimize (disable PCRE2 start-up optimization) to disable Perl
  207. # optimization by inserting (??{""}) at the start of the pattern. We may
  208. # also encounter -no_start_optimize from a #pattern setting.
  209. $mod =~ s/-no_start_optimize,?//;
  210. if ($mod =~ s/no_start_optimize,?//) { $pat =~ s/$del/$del(??{""})/; }
  211. # Add back retained modifiers and check that the pattern is valid.
  212. $mod =~ s/,//g;
  213. $pattern = "$pat$mod";
  214. eval "\$_ =~ ${pattern}";
  215. if ($@)
  216. {
  217. printf $outfile "Error: $@";
  218. if (! $interact)
  219. {
  220. for (;;)
  221. {
  222. last if ! ($_ = <$infile>);
  223. last if $_ =~ /^\s*$/;
  224. }
  225. }
  226. next NEXT_RE;
  227. }
  228. # If the /g modifier is present, we want to put a loop round the matching;
  229. # otherwise just a single "if".
  230. $cmd = ($pattern =~ /g[a-z]*\s*$/)? "while" : "if";
  231. # If the pattern is actually the null string, Perl uses the most recently
  232. # executed (and successfully compiled) regex is used instead. This is a
  233. # nasty trap for the unwary! The PCRE2 test suite does contain null strings
  234. # in places - if they are allowed through here all sorts of weird and
  235. # unexpected effects happen. To avoid this, we replace such patterns with
  236. # a non-null pattern that has the same effect.
  237. $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/);
  238. # Read data lines and test them
  239. for (;;)
  240. {
  241. printf "data> " if $interact;
  242. last NEXT_RE if ! ($_ = <$infile>);
  243. chomp;
  244. printf $outfile "%s", "$_\n" if ! $interact;
  245. s/\s+$//; # Remove trailing space
  246. s/^\s+//; # Remove leading space
  247. last if ($_ eq "");
  248. next if $_ =~ /^\\=(?:\s|$)/; # Comment line
  249. if ($subject_literal)
  250. {
  251. $x = $_;
  252. }
  253. else
  254. {
  255. $x = eval "\"$_\""; # To get escapes processed
  256. }
  257. # Empty array for holding results, ensure $REGERROR and $REGMARK are
  258. # unset, then do the matching.
  259. @subs = ();
  260. $pushes = "push \@subs,\$&;" .
  261. "push \@subs,\$1;" .
  262. "push \@subs,\$2;" .
  263. "push \@subs,\$3;" .
  264. "push \@subs,\$4;" .
  265. "push \@subs,\$5;" .
  266. "push \@subs,\$6;" .
  267. "push \@subs,\$7;" .
  268. "push \@subs,\$8;" .
  269. "push \@subs,\$9;" .
  270. "push \@subs,\$10;" .
  271. "push \@subs,\$11;" .
  272. "push \@subs,\$12;" .
  273. "push \@subs,\$13;" .
  274. "push \@subs,\$14;" .
  275. "push \@subs,\$15;" .
  276. "push \@subs,\$16;" .
  277. "push \@subs,\$'; }";
  278. undef $REGERROR;
  279. undef $REGMARK;
  280. eval "${cmd} (\$x =~ ${pattern}) {" . $pushes;
  281. if ($@)
  282. {
  283. printf $outfile "Error: $@\n";
  284. next NEXT_RE;
  285. }
  286. elsif (scalar(@subs) == 0)
  287. {
  288. printf $outfile "No match";
  289. if ($show_mark && defined $REGERROR && $REGERROR != 1)
  290. { printf $outfile (", mark = %s", &pchars($REGERROR)); }
  291. printf $outfile "\n";
  292. }
  293. else
  294. {
  295. while (scalar(@subs) != 0)
  296. {
  297. printf $outfile (" 0: %s\n", &pchars($subs[0]));
  298. printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest;
  299. $last_printed = 0;
  300. for ($i = 1; $i <= 16; $i++)
  301. {
  302. if (defined $subs[$i])
  303. {
  304. while ($last_printed++ < $i-1)
  305. { printf $outfile ("%2d: <unset>\n", $last_printed); }
  306. printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i]));
  307. $last_printed = $i;
  308. }
  309. }
  310. splice(@subs, 0, 18);
  311. }
  312. # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
  313. # set and the input pattern was a UTF-8 string. We can, however, force
  314. # it to be so marked.
  315. if ($show_mark && defined $REGMARK && $REGMARK != 1)
  316. {
  317. $xx = $REGMARK;
  318. $xx = Encode::decode_utf8($xx) if $utf8;
  319. printf $outfile ("MK: %s\n", &pchars($xx));
  320. }
  321. }
  322. }
  323. }
  324. # By closing OUTFILE explicitly, we avoid a Perl warning in -w mode
  325. # "main::OUTFILE" used only once".
  326. close(OUTFILE) if $outfile eq "OUTFILE";
  327. PERLEND
  328. ) | $perl $perlarg - $@
  329. # End