github的一些开源项目
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

606 lines
9.8 KiB

  1. # This set of tests is for UTF-16 and UTF-32 support, including Unicode
  2. # properties. It is relevant only to the 16-bit and 32-bit libraries. The
  3. # output is different for each library, so there are separate output files.
  4. /���xxx/IB,utf,no_utf_check
  5. /abc/utf
  6. �]
  7. # Check maximum character size
  8. /\x{ffff}/IB,utf
  9. /\x{10000}/IB,utf
  10. /\x{100}/IB,utf
  11. /\x{1000}/IB,utf
  12. /\x{10000}/IB,utf
  13. /\x{100000}/IB,utf
  14. /\x{10ffff}/IB,utf
  15. /[\x{ff}]/IB,utf
  16. /[\x{100}]/IB,utf
  17. /\x80/IB,utf
  18. /\xff/IB,utf
  19. /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
  20. \x{D55c}\x{ad6d}\x{C5B4}
  21. /\x{65e5}\x{672c}\x{8a9e}/IB,utf
  22. \x{65e5}\x{672c}\x{8a9e}
  23. /\x{80}/IB,utf
  24. /\x{084}/IB,utf
  25. /\x{104}/IB,utf
  26. /\x{861}/IB,utf
  27. /\x{212ab}/IB,utf
  28. /[^ab\xC0-\xF0]/IB,utf
  29. \x{f1}
  30. \x{bf}
  31. \x{100}
  32. \x{1000}
  33. \= Expect no match
  34. \x{c0}
  35. \x{f0}
  36. /Ā{3,4}/IB,utf
  37. \x{100}\x{100}\x{100}\x{100\x{100}
  38. /(\x{100}+|x)/IB,utf
  39. /(\x{100}*a|x)/IB,utf
  40. /(\x{100}{0,2}a|x)/IB,utf
  41. /(\x{100}{1,2}a|x)/IB,utf
  42. /\x{100}/IB,utf
  43. /a\x{100}\x{101}*/IB,utf
  44. /a\x{100}\x{101}+/IB,utf
  45. /[^\x{c4}]/IB
  46. /[\x{100}]/IB,utf
  47. \x{100}
  48. Z\x{100}
  49. \x{100}Z
  50. /[\xff]/IB,utf
  51. >\x{ff}<
  52. /[^\xff]/IB,utf
  53. /\x{100}abc(xyz(?1))/IB,utf
  54. /\777/I,utf
  55. \x{1ff}
  56. \777
  57. /\x{100}+\x{200}/IB,utf
  58. /\x{100}+X/IB,utf
  59. /^[\QĀ\E-\QŐ\E/B,utf
  60. /X/utf
  61. XX\x{d800}\=no_utf_check
  62. XX\x{da00}\=no_utf_check
  63. XX\x{dc00}\=no_utf_check
  64. XX\x{de00}\=no_utf_check
  65. XX\x{dfff}\=no_utf_check
  66. \= Expect UTF error
  67. XX\x{d800}
  68. XX\x{da00}
  69. XX\x{dc00}
  70. XX\x{de00}
  71. XX\x{dfff}
  72. XX\x{110000}
  73. XX\x{d800}\x{1234}
  74. \= Expect no match
  75. XX\x{d800}\=offset=3
  76. /(?<=.)X/utf
  77. XX\x{d800}\=offset=3
  78. /(*UTF16)\x{11234}/
  79. abcd\x{11234}pqr
  80. /(*UTF)\x{11234}/I
  81. abcd\x{11234}pqr
  82. /(*UTF-32)\x{11234}/
  83. abcd\x{11234}pqr
  84. /(*UTF-32)\x{112}/
  85. abcd\x{11234}pqr
  86. /(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
  87. /(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I
  88. /\h/I,utf
  89. ABC\x{09}
  90. ABC\x{20}
  91. ABC\x{a0}
  92. ABC\x{1680}
  93. ABC\x{180e}
  94. ABC\x{2000}
  95. ABC\x{202f}
  96. ABC\x{205f}
  97. ABC\x{3000}
  98. /\v/I,utf
  99. ABC\x{0a}
  100. ABC\x{0b}
  101. ABC\x{0c}
  102. ABC\x{0d}
  103. ABC\x{85}
  104. ABC\x{2028}
  105. /\h*A/I,utf
  106. CDBABC
  107. \x{2000}ABC
  108. /\R*A/I,bsr=unicode,utf
  109. CDBABC
  110. \x{2028}A
  111. /\v+A/I,utf
  112. /\s?xxx\s/I,utf
  113. /\sxxx\s/I,utf,tables=2
  114. AB\x{85}xxx\x{a0}XYZ
  115. AB\x{a0}xxx\x{85}XYZ
  116. /\S \S/I,utf,tables=2
  117. \x{a2} \x{84}
  118. A Z
  119. /a+/utf
  120. a\x{123}aa\=offset=1
  121. a\x{123}aa\=offset=2
  122. a\x{123}aa\=offset=3
  123. \= Expect no match
  124. a\x{123}aa\=offset=4
  125. \= Expect bad offset error
  126. a\x{123}aa\=offset=5
  127. a\x{123}aa\=offset=6
  128. /\x{1234}+/Ii,utf
  129. /\x{1234}+?/Ii,utf
  130. /\x{1234}++/Ii,utf
  131. /\x{1234}{2}/Ii,utf
  132. /[^\x{c4}]/IB,utf
  133. /X+\x{200}/IB,utf
  134. /\R/I,utf
  135. # Check bad offset
  136. /a/utf
  137. \= Expect bad UTF-16 offset, or no match in 32-bit
  138. \x{10000}\=offset=1
  139. \x{10000}ab\=offset=1
  140. \= Expect 16-bit match, 32-bit no match
  141. \x{10000}ab\=offset=2
  142. \= Expect no match
  143. \x{10000}ab\=offset=3
  144. \= Expect no match in 16-bit, bad offset in 32-bit
  145. \x{10000}ab\=offset=4
  146. \= Expect bad offset
  147. \x{10000}ab\=offset=5
  148. /���/utf
  149. /\w+\x{C4}/B,utf
  150. a\x{C4}\x{C4}
  151. /\w+\x{C4}/B,utf,tables=2
  152. a\x{C4}\x{C4}
  153. /\W+\x{C4}/B,utf
  154. !\x{C4}
  155. /\W+\x{C4}/B,utf,tables=2
  156. !\x{C4}
  157. /\W+\x{A1}/B,utf
  158. !\x{A1}
  159. /\W+\x{A1}/B,utf,tables=2
  160. !\x{A1}
  161. /X\s+\x{A0}/B,utf
  162. X\x20\x{A0}\x{A0}
  163. /X\s+\x{A0}/B,utf,tables=2
  164. X\x20\x{A0}\x{A0}
  165. /\S+\x{A0}/B,utf
  166. X\x{A0}\x{A0}
  167. /\S+\x{A0}/B,utf,tables=2
  168. X\x{A0}\x{A0}
  169. /\x{a0}+\s!/B,utf
  170. \x{a0}\x20!
  171. /\x{a0}+\s!/B,utf,tables=2
  172. \x{a0}\x20!
  173. /(*UTF)abc/never_utf
  174. /abc/utf,never_utf
  175. /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
  176. /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
  177. /AB\x{1fb0}/IB,utf
  178. /AB\x{1fb0}/IBi,utf
  179. /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
  180. \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
  181. \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
  182. /[ⱥ]/Bi,utf
  183. /[^ⱥ]/Bi,utf
  184. /[[:blank:]]/B,ucp
  185. /\x{212a}+/Ii,utf
  186. KKkk\x{212a}
  187. /s+/Ii,utf
  188. SSss\x{17f}
  189. # Non-UTF characters should give errors in both 16-bit and 32-bit modes.
  190. /\x{110000}/utf
  191. /\o{4200000}/utf
  192. /\x{100}*A/IB,utf
  193. A
  194. /\x{100}*\d(?R)/IB,utf
  195. /[Z\x{100}]/IB,utf
  196. Z\x{100}
  197. \x{100}
  198. \x{100}Z
  199. /[z-\x{100}]/IB,utf
  200. /[z\Qa-d]Ā\E]/IB,utf
  201. \x{100}
  202. Ā
  203. /[ab\x{100}]abc(xyz(?1))/IB,utf
  204. /\x{100}*\s/IB,utf
  205. /\x{100}*\d/IB,utf
  206. /\x{100}*\w/IB,utf
  207. /\x{100}*\D/IB,utf
  208. /\x{100}*\S/IB,utf
  209. /\x{100}*\W/IB,utf
  210. /[\x{105}-\x{109}]/IBi,utf
  211. \x{104}
  212. \x{105}
  213. \x{109}
  214. \= Expect no match
  215. \x{100}
  216. \x{10a}
  217. /[z-\x{100}]/IBi,utf
  218. Z
  219. z
  220. \x{39c}
  221. \x{178}
  222. |
  223. \x{80}
  224. \x{ff}
  225. \x{100}
  226. \x{101}
  227. \= Expect no match
  228. \x{102}
  229. Y
  230. y
  231. /[z-\x{100}]/IBi,utf
  232. /\x{3a3}B/IBi,utf
  233. /./utf
  234. \x{110000}
  235. /(*UTF)ab������z/B
  236. /ab������z/utf
  237. /[\W\p{Any}]/B
  238. abc
  239. 123
  240. /[\W\pL]/B
  241. abc
  242. \x{100}
  243. \x{308}
  244. \= Expect no match
  245. 123
  246. /[\s[:^ascii:]]/B,ucp
  247. /\pP/ucp
  248. \x{7fffffff}
  249. # A special extra option allows escaped surrogate code points in 32-bit mode,
  250. # but subjects containing them must not be UTF-checked. These patterns give
  251. # errors in 16-bit mode.
  252. /\x{d800}/I,utf,allow_surrogate_escapes
  253. \x{d800}\=no_utf_check
  254. /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
  255. \x{dfff}\x{df01}\=no_utf_check
  256. # This has different starting code units in 8-bit mode.
  257. /^[^ab]/IB,utf
  258. c
  259. \x{ff}
  260. \x{100}
  261. \= Expect no match
  262. aaa
  263. # Offsets are different in 8-bit mode.
  264. /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
  265. 123abcáyzabcdef789abcሴqr
  266. # A few script run tests in non-UTF mode (but they need Unicode support)
  267. /^(*script_run:.{4})/
  268. \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
  269. \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
  270. \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
  271. /^(*sr:.*)/utf,allow_surrogate_escapes
  272. \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
  273. \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
  274. /(?(n/utf
  275. /(?(á/utf
  276. # Invalid UTF-16/32 tests.
  277. /.../g,match_invalid_utf
  278. abcd\x{df00}wxzy\x{df00}pqrs
  279. abcd\x{80}wxzy\x{df00}pqrs
  280. /abc/match_invalid_utf
  281. ab\x{df00}ab\=ph
  282. \= Expect no match
  283. ab\x{df00}cdef\=ph
  284. /.a/match_invalid_utf
  285. ab\=ph
  286. ab\=ps
  287. \= Expect no match
  288. b\x{df00}\=ph
  289. b\x{df00}\=ps
  290. /.a$/match_invalid_utf
  291. ab\=ph
  292. ab\=ps
  293. \= Expect no match
  294. b\x{df00}\=ph
  295. b\x{df00}\=ps
  296. /ab$/match_invalid_utf
  297. ab\x{df00}cdeab
  298. \= Expect no match
  299. ab\x{df00}cde
  300. /.../g,match_invalid_utf
  301. abcd\x{80}wxzy\x{df00}pqrs
  302. /(?<=x)../g,match_invalid_utf
  303. abcd\x{80}wxzy\x{df00}pqrs
  304. abcd\x{80}wxzy\x{df00}xpqrs
  305. /X$/match_invalid_utf
  306. \= Expect no match
  307. X\x{df00}
  308. /(?<=..)X/match_invalid_utf,aftertext
  309. AB\x{df00}AQXYZ
  310. AB\x{df00}AQXYZ\=offset=5
  311. AB\x{df00}\x{df00}AXYZXC\=offset=5
  312. \= Expect no match
  313. AB\x{df00}XYZ
  314. AB\x{df00}XYZ\=offset=3
  315. AB\x{df00}AXYZ
  316. AB\x{df00}AXYZ\=offset=4
  317. AB\x{df00}\x{df00}AXYZ\=offset=5
  318. /.../match_invalid_utf
  319. \= Expect no match
  320. A\x{d800}B
  321. A\x{110000}B
  322. /aa/utf,ucp,match_invalid_utf,global
  323. aa\x{d800}aa
  324. /aa/utf,ucp,match_invalid_utf,global
  325. \x{d800}aa
  326. /A\z/utf,match_invalid_utf
  327. A\x{df00}\n
  328. # ----------------------------------------------------
  329. /(*UTF)(?=\x{123})/I
  330. /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
  331. /[\xff\x{ffff}]/I,utf
  332. /[\xff\x{ff}]/I,utf
  333. /[\xff\x{ff}]/I
  334. /[Ss]/I
  335. /[Ss]/I,utf
  336. /(?:\x{ff}|\x{3000})/I,utf
  337. # ----------------------------------------------------
  338. # UCP and casing tests
  339. /\x{120}/i,I
  340. /\x{c1}/i,I,ucp
  341. /[\x{120}\x{121}]/iB,ucp
  342. /[ab\x{120}]+/iB,ucp
  343. aABb\x{121}\x{120}
  344. /\x{c1}/i,no_start_optimize
  345. \= Expect no match
  346. \x{e1}
  347. /\x{120}\x{c1}/i,ucp,no_start_optimize
  348. \x{121}\x{e1}
  349. /\x{120}\x{c1}/i,ucp
  350. \x{121}\x{e1}
  351. /[^\x{120}]/i,no_start_optimize
  352. \x{121}
  353. /[^\x{120}]/i,ucp,no_start_optimize
  354. \= Expect no match
  355. \x{121}
  356. /[^\x{120}]/i
  357. \x{121}
  358. /[^\x{120}]/i,ucp
  359. \= Expect no match
  360. \x{121}
  361. /\x{120}{2}/i,ucp
  362. \x{121}\x{121}
  363. /[^\x{120}]{2}/i,ucp
  364. \= Expect no match
  365. \x{121}\x{121}
  366. /\x{c1}+\x{e1}/iB,ucp
  367. \x{c1}\x{c1}\x{c1}
  368. /\x{c1}+\x{e1}/iIB,ucp
  369. \x{c1}\x{c1}\x{c1}
  370. \x{e1}\x{e1}\x{e1}
  371. /a|\x{c1}/iI,ucp
  372. \x{e1}xxx
  373. /\x{c1}|\x{e1}/iI,ucp
  374. /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
  375. X\x{e1}Y
  376. /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
  377. X\x{121}Y
  378. /s/i,ucp
  379. \x{17f}
  380. /s/i,utf
  381. \x{17f}
  382. /[^s]/i,ucp
  383. \= Expect no match
  384. \x{17f}
  385. /[^s]/i,utf
  386. \= Expect no match
  387. \x{17f}
  388. # ----------------------------------------------------
  389. # Quantifier after a literal that has the value of META_ACCEPT (not UTF). This
  390. # fails in 16-bit mode, but is OK for 32-bit.
  391. /\x{802a0000}*/
  392. \x{802a0000}\x{802a0000}
  393. # UTF matching without UTF, check invalid UTF characters
  394. /\X++/
  395. a\x{110000}\x{ffffffff}
  396. # This used to loop in 32-bit mode; it will fail in 16-bit mode.
  397. /[\x{ffffffff}]/caseless,ucp
  398. \x{ffffffff}xyz
  399. # These are 32-bit tests for handling 0xffffffff when in UCP caseless mode. They
  400. # will give errors in 16-bit mode.
  401. /k*\x{ffffffff}/caseless,ucp
  402. \x{ffffffff}
  403. /k+\x{ffffffff}/caseless,ucp,no_start_optimize
  404. K\x{ffffffff}
  405. \= Expect no match
  406. \x{ffffffff}\x{ffffffff}
  407. /k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
  408. \= Expect no match
  409. \x{ffffffff}\x{ffffffff}\x{ffffffff}
  410. /k\x{ffffffff}/caseless,ucp,no_start_optimize
  411. K\x{ffffffff}
  412. \= Expect no match
  413. \x{ffffffff}\x{ffffffff}\x{ffffffff}
  414. /k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
  415. \= Expect no match
  416. Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
  417. # ---------------------------------------------------------
  418. # End of testinput12