github的一些开源项目
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

648 lines
10 KiB

  1. # This set of tests is for UTF-8 support and Unicode property support, with
  2. # relevance only for the 8-bit library.
  3. #newline_default lf any anycrlf
  4. # The next 5 patterns have UTF-8 errors
  5. /[�]/utf
  6. /�/utf
  7. /���xxx/utf
  8. /��������/utf
  9. /��������/match_invalid_utf
  10. # Now test subjects
  11. /badutf/utf
  12. \= Expect UTF-8 errors
  13. X\xdf
  14. XX\xef
  15. XXX\xef\x80
  16. X\xf7
  17. XX\xf7\x80
  18. XXX\xf7\x80\x80
  19. \xfb
  20. \xfb\x80
  21. \xfb\x80\x80
  22. \xfb\x80\x80\x80
  23. \xfd
  24. \xfd\x80
  25. \xfd\x80\x80
  26. \xfd\x80\x80\x80
  27. \xfd\x80\x80\x80\x80
  28. \xdf\x7f
  29. \xef\x7f\x80
  30. \xef\x80\x7f
  31. \xf7\x7f\x80\x80
  32. \xf7\x80\x7f\x80
  33. \xf7\x80\x80\x7f
  34. \xfb\x7f\x80\x80\x80
  35. \xfb\x80\x7f\x80\x80
  36. \xfb\x80\x80\x7f\x80
  37. \xfb\x80\x80\x80\x7f
  38. \xfd\x7f\x80\x80\x80\x80
  39. \xfd\x80\x7f\x80\x80\x80
  40. \xfd\x80\x80\x7f\x80\x80
  41. \xfd\x80\x80\x80\x7f\x80
  42. \xfd\x80\x80\x80\x80\x7f
  43. \xed\xa0\x80
  44. \xc0\x8f
  45. \xe0\x80\x8f
  46. \xf0\x80\x80\x8f
  47. \xf8\x80\x80\x80\x8f
  48. \xfc\x80\x80\x80\x80\x8f
  49. \x80
  50. \xfe
  51. \xff
  52. /badutf/utf
  53. \= Expect UTF-8 errors
  54. XX\xfb\x80\x80\x80\x80
  55. XX\xfd\x80\x80\x80\x80\x80
  56. XX\xf7\xbf\xbf\xbf
  57. /shortutf/utf
  58. \= Expect UTF-8 errors
  59. XX\xdf\=ph
  60. XX\xef\=ph
  61. XX\xef\x80\=ph
  62. \xf7\=ph
  63. \xf7\x80\=ph
  64. \xf7\x80\x80\=ph
  65. \xfb\=ph
  66. \xfb\x80\=ph
  67. \xfb\x80\x80\=ph
  68. \xfb\x80\x80\x80\=ph
  69. \xfd\=ph
  70. \xfd\x80\=ph
  71. \xfd\x80\x80\=ph
  72. \xfd\x80\x80\x80\=ph
  73. \xfd\x80\x80\x80\x80\=ph
  74. /anything/utf
  75. \= Expect UTF-8 errors
  76. X\xc0\x80
  77. XX\xc1\x8f
  78. XXX\xe0\x9f\x80
  79. \xf0\x8f\x80\x80
  80. \xf8\x87\x80\x80\x80
  81. \xfc\x83\x80\x80\x80\x80
  82. \xfe\x80\x80\x80\x80\x80
  83. \xff\x80\x80\x80\x80\x80
  84. \xf8\x88\x80\x80\x80
  85. \xf9\x87\x80\x80\x80
  86. \xfc\x84\x80\x80\x80\x80
  87. \xfd\x83\x80\x80\x80\x80
  88. \= Expect no match
  89. \xc3\x8f
  90. \xe0\xaf\x80
  91. \xe1\x80\x80
  92. \xf0\x9f\x80\x80
  93. \xf1\x8f\x80\x80
  94. \xf8\x88\x80\x80\x80\=no_utf_check
  95. \xf9\x87\x80\x80\x80\=no_utf_check
  96. \xfc\x84\x80\x80\x80\x80\=no_utf_check
  97. \xfd\x83\x80\x80\x80\x80\=no_utf_check
  98. # Similar tests with offsets
  99. /badutf/utf
  100. \= Expect UTF-8 errors
  101. X\xdfabcd
  102. X\xdfabcd\=offset=1
  103. \= Expect no match
  104. X\xdfabcd\=offset=2
  105. /(?<=x)badutf/utf
  106. \= Expect UTF-8 errors
  107. X\xdfabcd
  108. X\xdfabcd\=offset=1
  109. X\xdfabcd\=offset=2
  110. X\xdfabcd\xdf\=offset=3
  111. \= Expect no match
  112. X\xdfabcd\=offset=3
  113. /(?<=xx)badutf/utf
  114. \= Expect UTF-8 errors
  115. X\xdfabcd
  116. X\xdfabcd\=offset=1
  117. X\xdfabcd\=offset=2
  118. X\xdfabcd\=offset=3
  119. /(?<=xxxx)badutf/utf
  120. \= Expect UTF-8 errors
  121. X\xdfabcd
  122. X\xdfabcd\=offset=1
  123. X\xdfabcd\=offset=2
  124. X\xdfabcd\=offset=3
  125. X\xdfabc\xdf\=offset=6
  126. X\xdfabc\xdf\=offset=7
  127. \= Expect no match
  128. X\xdfabcd\=offset=6
  129. /\x{100}/IB,utf
  130. /\x{1000}/IB,utf
  131. /\x{10000}/IB,utf
  132. /\x{100000}/IB,utf
  133. /\x{10ffff}/IB,utf
  134. /[\x{ff}]/IB,utf
  135. /[\x{100}]/IB,utf
  136. /\x80/IB,utf
  137. /\xff/IB,utf
  138. /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
  139. \x{D55c}\x{ad6d}\x{C5B4}
  140. /\x{65e5}\x{672c}\x{8a9e}/IB,utf
  141. \x{65e5}\x{672c}\x{8a9e}
  142. /\x{80}/IB,utf
  143. /\x{084}/IB,utf
  144. /\x{104}/IB,utf
  145. /\x{861}/IB,utf
  146. /\x{212ab}/IB,utf
  147. /[^ab\xC0-\xF0]/IB,utf
  148. \x{f1}
  149. \x{bf}
  150. \x{100}
  151. \x{1000}
  152. \= Expect no match
  153. \x{c0}
  154. \x{f0}
  155. /Ā{3,4}/IB,utf
  156. \x{100}\x{100}\x{100}\x{100\x{100}
  157. /(\x{100}+|x)/IB,utf
  158. /(\x{100}*a|x)/IB,utf
  159. /(\x{100}{0,2}a|x)/IB,utf
  160. /(\x{100}{1,2}a|x)/IB,utf
  161. /\x{100}/IB,utf
  162. /a\x{100}\x{101}*/IB,utf
  163. /a\x{100}\x{101}+/IB,utf
  164. /[^\x{c4}]/IB
  165. /[\x{100}]/IB,utf
  166. \x{100}
  167. Z\x{100}
  168. \x{100}Z
  169. /[\xff]/IB,utf
  170. >\x{ff}<
  171. /[^\xff]/IB,utf
  172. /\x{100}abc(xyz(?1))/IB,utf
  173. /\777/I,utf
  174. \x{1ff}
  175. \777
  176. /\x{100}+\x{200}/IB,utf
  177. /\x{100}+X/IB,utf
  178. /^[\QĀ\E-\QŐ\E/B,utf
  179. # This tests the stricter UTF-8 check according to RFC 3629.
  180. /X/utf
  181. \= Expect UTF-8 errors
  182. \x{d800}
  183. \x{da00}
  184. \x{dfff}
  185. \x{110000}
  186. \x{2000000}
  187. \x{7fffffff}
  188. \= Expect no match
  189. \x{d800}\=no_utf_check
  190. \x{da00}\=no_utf_check
  191. \x{dfff}\=no_utf_check
  192. \x{110000}\=no_utf_check
  193. \x{2000000}\=no_utf_check
  194. \x{7fffffff}\=no_utf_check
  195. /(*UTF8)\x{1234}/
  196. abcd\x{1234}pqr
  197. /(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I
  198. /\h/I,utf
  199. ABC\x{09}
  200. ABC\x{20}
  201. ABC\x{a0}
  202. ABC\x{1680}
  203. ABC\x{180e}
  204. ABC\x{2000}
  205. ABC\x{202f}
  206. ABC\x{205f}
  207. ABC\x{3000}
  208. /\v/I,utf
  209. ABC\x{0a}
  210. ABC\x{0b}
  211. ABC\x{0c}
  212. ABC\x{0d}
  213. ABC\x{85}
  214. ABC\x{2028}
  215. /\h*A/I,utf
  216. CDBABC
  217. /\v+A/I,utf
  218. /\s?xxx\s/I,utf
  219. /\sxxx\s/I,utf,tables=2
  220. AB\x{85}xxx\x{a0}XYZ
  221. AB\x{a0}xxx\x{85}XYZ
  222. /\S \S/I,utf,tables=2
  223. \x{a2} \x{84}
  224. A Z
  225. /a+/utf
  226. a\x{123}aa\=offset=1
  227. a\x{123}aa\=offset=3
  228. a\x{123}aa\=offset=4
  229. \= Expect bad offset value
  230. a\x{123}aa\=offset=6
  231. \= Expect bad UTF-8 offset
  232. a\x{123}aa\=offset=2
  233. \= Expect no match
  234. a\x{123}aa\=offset=5
  235. /\x{1234}+/Ii,utf
  236. /\x{1234}+?/Ii,utf
  237. /\x{1234}++/Ii,utf
  238. /\x{1234}{2}/Ii,utf
  239. /[^\x{c4}]/IB,utf
  240. /X+\x{200}/IB,utf
  241. /\R/I,utf
  242. /\777/IB,utf
  243. /\w+\x{C4}/B,utf
  244. a\x{C4}\x{C4}
  245. /\w+\x{C4}/B,utf,tables=2
  246. a\x{C4}\x{C4}
  247. /\W+\x{C4}/B,utf
  248. !\x{C4}
  249. /\W+\x{C4}/B,utf,tables=2
  250. !\x{C4}
  251. /\W+\x{A1}/B,utf
  252. !\x{A1}
  253. /\W+\x{A1}/B,utf,tables=2
  254. !\x{A1}
  255. /X\s+\x{A0}/B,utf
  256. X\x20\x{A0}\x{A0}
  257. /X\s+\x{A0}/B,utf,tables=2
  258. X\x20\x{A0}\x{A0}
  259. /\S+\x{A0}/B,utf
  260. X\x{A0}\x{A0}
  261. /\S+\x{A0}/B,utf,tables=2
  262. X\x{A0}\x{A0}
  263. /\x{a0}+\s!/B,utf
  264. \x{a0}\x20!
  265. /\x{a0}+\s!/B,utf,tables=2
  266. \x{a0}\x20!
  267. /A/utf
  268. \x{ff000041}
  269. \x{7f000041}
  270. /(*UTF8)abc/never_utf
  271. /abc/utf,never_utf
  272. /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
  273. /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
  274. /AB\x{1fb0}/IB,utf
  275. /AB\x{1fb0}/IBi,utf
  276. /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
  277. \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
  278. \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
  279. /[ⱥ]/Bi,utf
  280. /[^ⱥ]/Bi,utf
  281. /\h/I
  282. /\v/I
  283. /\R/I
  284. /[[:blank:]]/B,ucp
  285. /\x{212a}+/Ii,utf
  286. KKkk\x{212a}
  287. /s+/Ii,utf
  288. SSss\x{17f}
  289. /\x{100}*A/IB,utf
  290. A
  291. /\x{100}*\d(?R)/IB,utf
  292. /[Z\x{100}]/IB,utf
  293. Z\x{100}
  294. \x{100}
  295. \x{100}Z
  296. /[z-\x{100}]/IB,utf
  297. /[z\Qa-d]Ā\E]/IB,utf
  298. \x{100}
  299. Ā
  300. /[ab\x{100}]abc(xyz(?1))/IB,utf
  301. /\x{100}*\s/IB,utf
  302. /\x{100}*\d/IB,utf
  303. /\x{100}*\w/IB,utf
  304. /\x{100}*\D/IB,utf
  305. /\x{100}*\S/IB,utf
  306. /\x{100}*\W/IB,utf
  307. /[\x{105}-\x{109}]/IBi,utf
  308. \x{104}
  309. \x{105}
  310. \x{109}
  311. \= Expect no match
  312. \x{100}
  313. \x{10a}
  314. /[z-\x{100}]/IBi,utf
  315. Z
  316. z
  317. \x{39c}
  318. \x{178}
  319. |
  320. \x{80}
  321. \x{ff}
  322. \x{100}
  323. \x{101}
  324. \= Expect no match
  325. \x{102}
  326. Y
  327. y
  328. /[z-\x{100}]/IBi,utf
  329. /\x{3a3}B/IBi,utf
  330. /abc/utf,replace=�
  331. abc
  332. /(?<=(a)(?-1))x/I,utf
  333. a\x80zx\=offset=3
  334. /[\W\p{Any}]/B
  335. abc
  336. 123
  337. /[\W\pL]/B
  338. abc
  339. \= Expect no match
  340. 123
  341. /(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf
  342. /[\s[:^ascii:]]/B,ucp
  343. # A special extra option allows escaped surrogate code points in 8-bit mode,
  344. # but subjects containing them must not be UTF-checked.
  345. /\x{d800}/I,utf,allow_surrogate_escapes
  346. \x{d800}\=no_utf_check
  347. /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
  348. \x{dfff}\x{df01}\=no_utf_check
  349. # This has different starting code units in 8-bit mode.
  350. /^[^ab]/IB,utf
  351. c
  352. \x{ff}
  353. \x{100}
  354. \= Expect no match
  355. aaa
  356. # Offsets are different in 8-bit mode.
  357. /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
  358. 123abcáyzabcdef789abcሴqr
  359. # Check name length with non-ASCII characters
  360. /(?'ABáC678901234567890123456789012012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
  361. /(?'ABáC6789012345678901234567890123012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
  362. /(?'ABZC6789012345678901234567890123012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
  363. /(?(n/utf
  364. /(?(á/utf
  365. # Invalid UTF-8 tests
  366. /.../g,match_invalid_utf
  367. abcd\x80wxzy\x80pqrs
  368. abcd\x{80}wxzy\x80pqrs
  369. /abc/match_invalid_utf
  370. ab\x80ab\=ph
  371. \= Expect no match
  372. ab\x80cdef\=ph
  373. /.a/match_invalid_utf
  374. ab\=ph
  375. ab\=ps
  376. b\xf0\x91\x88b\=ph
  377. b\xf0\x91\x88b\=ps
  378. b\xf0\x91\x88\xb4a
  379. \= Expect no match
  380. b\x80\=ph
  381. b\x80\=ps
  382. b\xf0\x91\x88\=ph
  383. b\xf0\x91\x88\=ps
  384. /.a$/match_invalid_utf
  385. ab\=ph
  386. ab\=ps
  387. \= Expect no match
  388. b\xf0\x91\x98\=ph
  389. b\xf0\x91\x98\=ps
  390. /ab$/match_invalid_utf
  391. ab\x80cdeab
  392. \= Expect no match
  393. ab\x80cde
  394. /.../g,match_invalid_utf
  395. abcd\x{80}wxzy\x80pqrs
  396. /(?<=x)../g,match_invalid_utf
  397. abcd\x{80}wxzy\x80pqrs
  398. abcd\x{80}wxzy\x80xpqrs
  399. /X$/match_invalid_utf
  400. \= Expect no match
  401. X\xc4
  402. /(?<=..)X/match_invalid_utf,aftertext
  403. AB\x80AQXYZ
  404. AB\x80AQXYZ\=offset=5
  405. AB\x80\x80AXYZXC\=offset=5
  406. \= Expect no match
  407. AB\x80XYZ
  408. AB\x80XYZ\=offset=3
  409. AB\xfeXYZ
  410. AB\xffXYZ\=offset=3
  411. AB\x80AXYZ
  412. AB\x80AXYZ\=offset=4
  413. AB\x80\x80AXYZ\=offset=5
  414. /.../match_invalid_utf
  415. AB\xc4CCC
  416. \= Expect no match
  417. A\x{d800}B
  418. A\x{110000}B
  419. A\xc4B
  420. /\bX/match_invalid_utf
  421. A\x80X
  422. /\BX/match_invalid_utf
  423. \= Expect no match
  424. A\x80X
  425. /(?<=...)X/match_invalid_utf
  426. AAA\x80BBBXYZ
  427. \= Expect no match
  428. AAA\x80BXYZ
  429. AAA\x80BBXYZ
  430. # -------------------------------------
  431. /(*UTF)(?=\x{123})/I
  432. /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
  433. /[󿾟,]/BI,utf
  434. /[\x{fff4}-\x{ffff8}]/I,utf
  435. /[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
  436. /[\xff\x{ffff}]/I,utf
  437. /[\xff\x{ff}]/I,utf
  438. abc\x{ff}def
  439. /[\xff\x{ff}]/I
  440. abc\x{ff}def
  441. /[Ss]/I
  442. /[Ss]/I,utf
  443. /(?:\x{ff}|\x{3000})/I,utf
  444. /x/utf
  445. abxyz
  446. \x80\=startchar
  447. abc\x80\=startchar
  448. abc\x80\=startchar,offset=3
  449. /\x{c1}+\x{e1}/iIB,ucp
  450. \x{c1}\x{c1}\x{c1}
  451. \x{e1}\x{e1}\x{e1}
  452. /a|\x{c1}/iI,ucp
  453. \x{e1}xxx
  454. /a|\x{c1}/iI,utf
  455. \x{e1}xxx
  456. /\x{c1}|\x{e1}/iI,ucp
  457. /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
  458. X\x{e1}Y
  459. /X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
  460. X\x{c1}Y
  461. # Without UTF or UCP, characters > 127 have only one case in the default locale.
  462. /X(\x{e1})Y/replace=>\U$1<,substitute_extended
  463. X\x{e1}Y
  464. /A/utf,match_invalid_utf,caseless
  465. \xe5A
  466. /\bch\b/utf,match_invalid_utf
  467. qchq\=ph
  468. qchq\=ps
  469. /line1\nbreak/firstline,utf,match_invalid_utf
  470. line1\nbreak
  471. line0\nline1\nbreak
  472. /A\z/utf,match_invalid_utf
  473. A\x80\x42\n
  474. # End of testinput10