github的一些开源项目
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1868 lines
49 KiB

  1. # This set of tests is for UTF-16 and UTF-32 support, including Unicode
  2. # properties. It is relevant only to the 16-bit and 32-bit libraries. The
  3. # output is different for each library, so there are separate output files.
  4. /���xxx/IB,utf,no_utf_check
  5. ** Failed: invalid UTF-8 string cannot be converted to 16-bit string
  6. /abc/utf
  7. �]
  8. ** Failed: invalid UTF-8 string cannot be used as input in UTF mode
  9. # Check maximum character size
  10. /\x{ffff}/IB,utf
  11. ------------------------------------------------------------------
  12. Bra
  13. \x{ffff}
  14. Ket
  15. End
  16. ------------------------------------------------------------------
  17. Capture group count = 0
  18. Options: utf
  19. First code unit = \x{ffff}
  20. Subject length lower bound = 1
  21. /\x{10000}/IB,utf
  22. ------------------------------------------------------------------
  23. Bra
  24. \x{10000}
  25. Ket
  26. End
  27. ------------------------------------------------------------------
  28. Capture group count = 0
  29. Options: utf
  30. First code unit = \x{d800}
  31. Last code unit = \x{dc00}
  32. Subject length lower bound = 1
  33. /\x{100}/IB,utf
  34. ------------------------------------------------------------------
  35. Bra
  36. \x{100}
  37. Ket
  38. End
  39. ------------------------------------------------------------------
  40. Capture group count = 0
  41. Options: utf
  42. First code unit = \x{100}
  43. Subject length lower bound = 1
  44. /\x{1000}/IB,utf
  45. ------------------------------------------------------------------
  46. Bra
  47. \x{1000}
  48. Ket
  49. End
  50. ------------------------------------------------------------------
  51. Capture group count = 0
  52. Options: utf
  53. First code unit = \x{1000}
  54. Subject length lower bound = 1
  55. /\x{10000}/IB,utf
  56. ------------------------------------------------------------------
  57. Bra
  58. \x{10000}
  59. Ket
  60. End
  61. ------------------------------------------------------------------
  62. Capture group count = 0
  63. Options: utf
  64. First code unit = \x{d800}
  65. Last code unit = \x{dc00}
  66. Subject length lower bound = 1
  67. /\x{100000}/IB,utf
  68. ------------------------------------------------------------------
  69. Bra
  70. \x{100000}
  71. Ket
  72. End
  73. ------------------------------------------------------------------
  74. Capture group count = 0
  75. Options: utf
  76. First code unit = \x{dbc0}
  77. Last code unit = \x{dc00}
  78. Subject length lower bound = 1
  79. /\x{10ffff}/IB,utf
  80. ------------------------------------------------------------------
  81. Bra
  82. \x{10ffff}
  83. Ket
  84. End
  85. ------------------------------------------------------------------
  86. Capture group count = 0
  87. Options: utf
  88. First code unit = \x{dbff}
  89. Last code unit = \x{dfff}
  90. Subject length lower bound = 1
  91. /[\x{ff}]/IB,utf
  92. ------------------------------------------------------------------
  93. Bra
  94. \x{ff}
  95. Ket
  96. End
  97. ------------------------------------------------------------------
  98. Capture group count = 0
  99. Options: utf
  100. First code unit = \xff
  101. Subject length lower bound = 1
  102. /[\x{100}]/IB,utf
  103. ------------------------------------------------------------------
  104. Bra
  105. \x{100}
  106. Ket
  107. End
  108. ------------------------------------------------------------------
  109. Capture group count = 0
  110. Options: utf
  111. First code unit = \x{100}
  112. Subject length lower bound = 1
  113. /\x80/IB,utf
  114. ------------------------------------------------------------------
  115. Bra
  116. \x{80}
  117. Ket
  118. End
  119. ------------------------------------------------------------------
  120. Capture group count = 0
  121. Options: utf
  122. First code unit = \x80
  123. Subject length lower bound = 1
  124. /\xff/IB,utf
  125. ------------------------------------------------------------------
  126. Bra
  127. \x{ff}
  128. Ket
  129. End
  130. ------------------------------------------------------------------
  131. Capture group count = 0
  132. Options: utf
  133. First code unit = \xff
  134. Subject length lower bound = 1
  135. /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
  136. ------------------------------------------------------------------
  137. Bra
  138. \x{d55c}\x{ad6d}\x{c5b4}
  139. Ket
  140. End
  141. ------------------------------------------------------------------
  142. Capture group count = 0
  143. Options: utf
  144. First code unit = \x{d55c}
  145. Last code unit = \x{c5b4}
  146. Subject length lower bound = 3
  147. \x{D55c}\x{ad6d}\x{C5B4}
  148. 0: \x{d55c}\x{ad6d}\x{c5b4}
  149. /\x{65e5}\x{672c}\x{8a9e}/IB,utf
  150. ------------------------------------------------------------------
  151. Bra
  152. \x{65e5}\x{672c}\x{8a9e}
  153. Ket
  154. End
  155. ------------------------------------------------------------------
  156. Capture group count = 0
  157. Options: utf
  158. First code unit = \x{65e5}
  159. Last code unit = \x{8a9e}
  160. Subject length lower bound = 3
  161. \x{65e5}\x{672c}\x{8a9e}
  162. 0: \x{65e5}\x{672c}\x{8a9e}
  163. /\x{80}/IB,utf
  164. ------------------------------------------------------------------
  165. Bra
  166. \x{80}
  167. Ket
  168. End
  169. ------------------------------------------------------------------
  170. Capture group count = 0
  171. Options: utf
  172. First code unit = \x80
  173. Subject length lower bound = 1
  174. /\x{084}/IB,utf
  175. ------------------------------------------------------------------
  176. Bra
  177. \x{84}
  178. Ket
  179. End
  180. ------------------------------------------------------------------
  181. Capture group count = 0
  182. Options: utf
  183. First code unit = \x84
  184. Subject length lower bound = 1
  185. /\x{104}/IB,utf
  186. ------------------------------------------------------------------
  187. Bra
  188. \x{104}
  189. Ket
  190. End
  191. ------------------------------------------------------------------
  192. Capture group count = 0
  193. Options: utf
  194. First code unit = \x{104}
  195. Subject length lower bound = 1
  196. /\x{861}/IB,utf
  197. ------------------------------------------------------------------
  198. Bra
  199. \x{861}
  200. Ket
  201. End
  202. ------------------------------------------------------------------
  203. Capture group count = 0
  204. Options: utf
  205. First code unit = \x{861}
  206. Subject length lower bound = 1
  207. /\x{212ab}/IB,utf
  208. ------------------------------------------------------------------
  209. Bra
  210. \x{212ab}
  211. Ket
  212. End
  213. ------------------------------------------------------------------
  214. Capture group count = 0
  215. Options: utf
  216. First code unit = \x{d844}
  217. Last code unit = \x{deab}
  218. Subject length lower bound = 1
  219. /[^ab\xC0-\xF0]/IB,utf
  220. ------------------------------------------------------------------
  221. Bra
  222. [\x00-`c-\xbf\xf1-\xff] (neg)
  223. Ket
  224. End
  225. ------------------------------------------------------------------
  226. Capture group count = 0
  227. Options: utf
  228. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  229. \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
  230. \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
  231. 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
  232. Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
  233. \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e
  234. \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d
  235. \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac
  236. \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb
  237. \xbc \xbd \xbe \xbf \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb
  238. \xfc \xfd \xfe \xff
  239. Subject length lower bound = 1
  240. \x{f1}
  241. 0: \x{f1}
  242. \x{bf}
  243. 0: \x{bf}
  244. \x{100}
  245. 0: \x{100}
  246. \x{1000}
  247. 0: \x{1000}
  248. \= Expect no match
  249. \x{c0}
  250. No match
  251. \x{f0}
  252. No match
  253. /Ā{3,4}/IB,utf
  254. ------------------------------------------------------------------
  255. Bra
  256. \x{100}{3}
  257. \x{100}?+
  258. Ket
  259. End
  260. ------------------------------------------------------------------
  261. Capture group count = 0
  262. Options: utf
  263. First code unit = \x{100}
  264. Last code unit = \x{100}
  265. Subject length lower bound = 3
  266. \x{100}\x{100}\x{100}\x{100\x{100}
  267. 0: \x{100}\x{100}\x{100}
  268. /(\x{100}+|x)/IB,utf
  269. ------------------------------------------------------------------
  270. Bra
  271. CBra 1
  272. \x{100}++
  273. Alt
  274. x
  275. Ket
  276. Ket
  277. End
  278. ------------------------------------------------------------------
  279. Capture group count = 1
  280. Options: utf
  281. Starting code units: x \xff
  282. Subject length lower bound = 1
  283. /(\x{100}*a|x)/IB,utf
  284. ------------------------------------------------------------------
  285. Bra
  286. CBra 1
  287. \x{100}*+
  288. a
  289. Alt
  290. x
  291. Ket
  292. Ket
  293. End
  294. ------------------------------------------------------------------
  295. Capture group count = 1
  296. Options: utf
  297. Starting code units: a x \xff
  298. Subject length lower bound = 1
  299. /(\x{100}{0,2}a|x)/IB,utf
  300. ------------------------------------------------------------------
  301. Bra
  302. CBra 1
  303. \x{100}{0,2}+
  304. a
  305. Alt
  306. x
  307. Ket
  308. Ket
  309. End
  310. ------------------------------------------------------------------
  311. Capture group count = 1
  312. Options: utf
  313. Starting code units: a x \xff
  314. Subject length lower bound = 1
  315. /(\x{100}{1,2}a|x)/IB,utf
  316. ------------------------------------------------------------------
  317. Bra
  318. CBra 1
  319. \x{100}
  320. \x{100}{0,1}+
  321. a
  322. Alt
  323. x
  324. Ket
  325. Ket
  326. End
  327. ------------------------------------------------------------------
  328. Capture group count = 1
  329. Options: utf
  330. Starting code units: x \xff
  331. Subject length lower bound = 1
  332. /\x{100}/IB,utf
  333. ------------------------------------------------------------------
  334. Bra
  335. \x{100}
  336. Ket
  337. End
  338. ------------------------------------------------------------------
  339. Capture group count = 0
  340. Options: utf
  341. First code unit = \x{100}
  342. Subject length lower bound = 1
  343. /a\x{100}\x{101}*/IB,utf
  344. ------------------------------------------------------------------
  345. Bra
  346. a\x{100}
  347. \x{101}*+
  348. Ket
  349. End
  350. ------------------------------------------------------------------
  351. Capture group count = 0
  352. Options: utf
  353. First code unit = 'a'
  354. Last code unit = \x{100}
  355. Subject length lower bound = 2
  356. /a\x{100}\x{101}+/IB,utf
  357. ------------------------------------------------------------------
  358. Bra
  359. a\x{100}
  360. \x{101}++
  361. Ket
  362. End
  363. ------------------------------------------------------------------
  364. Capture group count = 0
  365. Options: utf
  366. First code unit = 'a'
  367. Last code unit = \x{101}
  368. Subject length lower bound = 3
  369. /[^\x{c4}]/IB
  370. ------------------------------------------------------------------
  371. Bra
  372. [^\x{c4}]
  373. Ket
  374. End
  375. ------------------------------------------------------------------
  376. Capture group count = 0
  377. Subject length lower bound = 1
  378. /[\x{100}]/IB,utf
  379. ------------------------------------------------------------------
  380. Bra
  381. \x{100}
  382. Ket
  383. End
  384. ------------------------------------------------------------------
  385. Capture group count = 0
  386. Options: utf
  387. First code unit = \x{100}
  388. Subject length lower bound = 1
  389. \x{100}
  390. 0: \x{100}
  391. Z\x{100}
  392. 0: \x{100}
  393. \x{100}Z
  394. 0: \x{100}
  395. /[\xff]/IB,utf
  396. ------------------------------------------------------------------
  397. Bra
  398. \x{ff}
  399. Ket
  400. End
  401. ------------------------------------------------------------------
  402. Capture group count = 0
  403. Options: utf
  404. First code unit = \xff
  405. Subject length lower bound = 1
  406. >\x{ff}<
  407. 0: \x{ff}
  408. /[^\xff]/IB,utf
  409. ------------------------------------------------------------------
  410. Bra
  411. [^\x{ff}]
  412. Ket
  413. End
  414. ------------------------------------------------------------------
  415. Capture group count = 0
  416. Options: utf
  417. Subject length lower bound = 1
  418. /\x{100}abc(xyz(?1))/IB,utf
  419. ------------------------------------------------------------------
  420. Bra
  421. \x{100}abc
  422. CBra 1
  423. xyz
  424. Recurse
  425. Ket
  426. Ket
  427. End
  428. ------------------------------------------------------------------
  429. Capture group count = 1
  430. Options: utf
  431. First code unit = \x{100}
  432. Last code unit = 'z'
  433. Subject length lower bound = 7
  434. /\777/I,utf
  435. Capture group count = 0
  436. Options: utf
  437. First code unit = \x{1ff}
  438. Subject length lower bound = 1
  439. \x{1ff}
  440. 0: \x{1ff}
  441. \777
  442. 0: \x{1ff}
  443. /\x{100}+\x{200}/IB,utf
  444. ------------------------------------------------------------------
  445. Bra
  446. \x{100}++
  447. \x{200}
  448. Ket
  449. End
  450. ------------------------------------------------------------------
  451. Capture group count = 0
  452. Options: utf
  453. First code unit = \x{100}
  454. Last code unit = \x{200}
  455. Subject length lower bound = 2
  456. /\x{100}+X/IB,utf
  457. ------------------------------------------------------------------
  458. Bra
  459. \x{100}++
  460. X
  461. Ket
  462. End
  463. ------------------------------------------------------------------
  464. Capture group count = 0
  465. Options: utf
  466. First code unit = \x{100}
  467. Last code unit = 'X'
  468. Subject length lower bound = 2
  469. /^[\QĀ\E-\QŐ\E/B,utf
  470. Failed: error 106 at offset 13: missing terminating ] for character class
  471. /X/utf
  472. XX\x{d800}\=no_utf_check
  473. 0: X
  474. XX\x{da00}\=no_utf_check
  475. 0: X
  476. XX\x{dc00}\=no_utf_check
  477. 0: X
  478. XX\x{de00}\=no_utf_check
  479. 0: X
  480. XX\x{dfff}\=no_utf_check
  481. 0: X
  482. \= Expect UTF error
  483. XX\x{d800}
  484. Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
  485. XX\x{da00}
  486. Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
  487. XX\x{dc00}
  488. Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
  489. XX\x{de00}
  490. Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
  491. XX\x{dfff}
  492. Failed: error -26: UTF-16 error: isolated low surrogate at offset 2
  493. XX\x{110000}
  494. ** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
  495. XX\x{d800}\x{1234}
  496. Failed: error -25: UTF-16 error: invalid low surrogate at offset 2
  497. \= Expect no match
  498. XX\x{d800}\=offset=3
  499. No match
  500. /(?<=.)X/utf
  501. XX\x{d800}\=offset=3
  502. Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
  503. /(*UTF16)\x{11234}/
  504. abcd\x{11234}pqr
  505. 0: \x{11234}
  506. /(*UTF)\x{11234}/I
  507. Capture group count = 0
  508. Compile options: <none>
  509. Overall options: utf
  510. First code unit = \x{d804}
  511. Last code unit = \x{de34}
  512. Subject length lower bound = 1
  513. abcd\x{11234}pqr
  514. 0: \x{11234}
  515. /(*UTF-32)\x{11234}/
  516. Failed: error 160 at offset 5: (*VERB) not recognized or malformed
  517. abcd\x{11234}pqr
  518. /(*UTF-32)\x{112}/
  519. Failed: error 160 at offset 5: (*VERB) not recognized or malformed
  520. abcd\x{11234}pqr
  521. /(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
  522. Capture group count = 0
  523. Compile options: <none>
  524. Overall options: utf
  525. \R matches any Unicode newline
  526. Forced newline is CRLF
  527. First code unit = 'a'
  528. Last code unit = 'b'
  529. Subject length lower bound = 3
  530. /(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I
  531. Failed: error 160 at offset 14: (*VERB) not recognized or malformed
  532. /\h/I,utf
  533. Capture group count = 0
  534. Options: utf
  535. Starting code units: \x09 \x20 \xa0 \xff
  536. Subject length lower bound = 1
  537. ABC\x{09}
  538. 0: \x{09}
  539. ABC\x{20}
  540. 0:
  541. ABC\x{a0}
  542. 0: \x{a0}
  543. ABC\x{1680}
  544. 0: \x{1680}
  545. ABC\x{180e}
  546. 0: \x{180e}
  547. ABC\x{2000}
  548. 0: \x{2000}
  549. ABC\x{202f}
  550. 0: \x{202f}
  551. ABC\x{205f}
  552. 0: \x{205f}
  553. ABC\x{3000}
  554. 0: \x{3000}
  555. /\v/I,utf
  556. Capture group count = 0
  557. Options: utf
  558. Starting code units: \x0a \x0b \x0c \x0d \x85 \xff
  559. Subject length lower bound = 1
  560. ABC\x{0a}
  561. 0: \x{0a}
  562. ABC\x{0b}
  563. 0: \x{0b}
  564. ABC\x{0c}
  565. 0: \x{0c}
  566. ABC\x{0d}
  567. 0: \x{0d}
  568. ABC\x{85}
  569. 0: \x{85}
  570. ABC\x{2028}
  571. 0: \x{2028}
  572. /\h*A/I,utf
  573. Capture group count = 0
  574. Options: utf
  575. Starting code units: \x09 \x20 A \xa0 \xff
  576. Last code unit = 'A'
  577. Subject length lower bound = 1
  578. CDBABC
  579. 0: A
  580. \x{2000}ABC
  581. 0: \x{2000}A
  582. /\R*A/I,bsr=unicode,utf
  583. Capture group count = 0
  584. Options: utf
  585. \R matches any Unicode newline
  586. Starting code units: \x0a \x0b \x0c \x0d A \x85 \xff
  587. Last code unit = 'A'
  588. Subject length lower bound = 1
  589. CDBABC
  590. 0: A
  591. \x{2028}A
  592. 0: \x{2028}A
  593. /\v+A/I,utf
  594. Capture group count = 0
  595. Options: utf
  596. Starting code units: \x0a \x0b \x0c \x0d \x85 \xff
  597. Last code unit = 'A'
  598. Subject length lower bound = 2
  599. /\s?xxx\s/I,utf
  600. Capture group count = 0
  601. Options: utf
  602. Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 x
  603. Last code unit = 'x'
  604. Subject length lower bound = 4
  605. /\sxxx\s/I,utf,tables=2
  606. Capture group count = 0
  607. Options: utf
  608. Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \x85 \xa0
  609. Last code unit = 'x'
  610. Subject length lower bound = 5
  611. AB\x{85}xxx\x{a0}XYZ
  612. 0: \x{85}xxx\x{a0}
  613. AB\x{a0}xxx\x{85}XYZ
  614. 0: \x{a0}xxx\x{85}
  615. /\S \S/I,utf,tables=2
  616. Capture group count = 0
  617. Options: utf
  618. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f
  619. \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e
  620. \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C
  621. D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h
  622. i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84
  623. \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94
  624. \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa1 \xa2 \xa3 \xa4
  625. \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3
  626. \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2
  627. \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1
  628. \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0
  629. \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef
  630. \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe
  631. \xff
  632. Last code unit = ' '
  633. Subject length lower bound = 3
  634. \x{a2} \x{84}
  635. 0: \x{a2} \x{84}
  636. A Z
  637. 0: A Z
  638. /a+/utf
  639. a\x{123}aa\=offset=1
  640. 0: aa
  641. a\x{123}aa\=offset=2
  642. 0: aa
  643. a\x{123}aa\=offset=3
  644. 0: a
  645. \= Expect no match
  646. a\x{123}aa\=offset=4
  647. No match
  648. \= Expect bad offset error
  649. a\x{123}aa\=offset=5
  650. Failed: error -33: bad offset value
  651. a\x{123}aa\=offset=6
  652. Failed: error -33: bad offset value
  653. /\x{1234}+/Ii,utf
  654. Capture group count = 0
  655. Options: caseless utf
  656. First code unit = \x{1234}
  657. Subject length lower bound = 1
  658. /\x{1234}+?/Ii,utf
  659. Capture group count = 0
  660. Options: caseless utf
  661. First code unit = \x{1234}
  662. Subject length lower bound = 1
  663. /\x{1234}++/Ii,utf
  664. Capture group count = 0
  665. Options: caseless utf
  666. First code unit = \x{1234}
  667. Subject length lower bound = 1
  668. /\x{1234}{2}/Ii,utf
  669. Capture group count = 0
  670. Options: caseless utf
  671. First code unit = \x{1234}
  672. Last code unit = \x{1234}
  673. Subject length lower bound = 2
  674. /[^\x{c4}]/IB,utf
  675. ------------------------------------------------------------------
  676. Bra
  677. [^\x{c4}]
  678. Ket
  679. End
  680. ------------------------------------------------------------------
  681. Capture group count = 0
  682. Options: utf
  683. Subject length lower bound = 1
  684. /X+\x{200}/IB,utf
  685. ------------------------------------------------------------------
  686. Bra
  687. X++
  688. \x{200}
  689. Ket
  690. End
  691. ------------------------------------------------------------------
  692. Capture group count = 0
  693. Options: utf
  694. First code unit = 'X'
  695. Last code unit = \x{200}
  696. Subject length lower bound = 2
  697. /\R/I,utf
  698. Capture group count = 0
  699. Options: utf
  700. Starting code units: \x0a \x0b \x0c \x0d \x85 \xff
  701. Subject length lower bound = 1
  702. # Check bad offset
  703. /a/utf
  704. \= Expect bad UTF-16 offset, or no match in 32-bit
  705. \x{10000}\=offset=1
  706. Error -36 (bad UTF-16 offset)
  707. \x{10000}ab\=offset=1
  708. Error -36 (bad UTF-16 offset)
  709. \= Expect 16-bit match, 32-bit no match
  710. \x{10000}ab\=offset=2
  711. 0: a
  712. \= Expect no match
  713. \x{10000}ab\=offset=3
  714. No match
  715. \= Expect no match in 16-bit, bad offset in 32-bit
  716. \x{10000}ab\=offset=4
  717. No match
  718. \= Expect bad offset
  719. \x{10000}ab\=offset=5
  720. Failed: error -33: bad offset value
  721. /���/utf
  722. Failed: error -26 at offset 0: UTF-16 error: isolated low surrogate
  723. /\w+\x{C4}/B,utf
  724. ------------------------------------------------------------------
  725. Bra
  726. \w++
  727. \x{c4}
  728. Ket
  729. End
  730. ------------------------------------------------------------------
  731. a\x{C4}\x{C4}
  732. 0: a\x{c4}
  733. /\w+\x{C4}/B,utf,tables=2
  734. ------------------------------------------------------------------
  735. Bra
  736. \w+
  737. \x{c4}
  738. Ket
  739. End
  740. ------------------------------------------------------------------
  741. a\x{C4}\x{C4}
  742. 0: a\x{c4}\x{c4}
  743. /\W+\x{C4}/B,utf
  744. ------------------------------------------------------------------
  745. Bra
  746. \W+
  747. \x{c4}
  748. Ket
  749. End
  750. ------------------------------------------------------------------
  751. !\x{C4}
  752. 0: !\x{c4}
  753. /\W+\x{C4}/B,utf,tables=2
  754. ------------------------------------------------------------------
  755. Bra
  756. \W++
  757. \x{c4}
  758. Ket
  759. End
  760. ------------------------------------------------------------------
  761. !\x{C4}
  762. 0: !\x{c4}
  763. /\W+\x{A1}/B,utf
  764. ------------------------------------------------------------------
  765. Bra
  766. \W+
  767. \x{a1}
  768. Ket
  769. End
  770. ------------------------------------------------------------------
  771. !\x{A1}
  772. 0: !\x{a1}
  773. /\W+\x{A1}/B,utf,tables=2
  774. ------------------------------------------------------------------
  775. Bra
  776. \W+
  777. \x{a1}
  778. Ket
  779. End
  780. ------------------------------------------------------------------
  781. !\x{A1}
  782. 0: !\x{a1}
  783. /X\s+\x{A0}/B,utf
  784. ------------------------------------------------------------------
  785. Bra
  786. X
  787. \s++
  788. \x{a0}
  789. Ket
  790. End
  791. ------------------------------------------------------------------
  792. X\x20\x{A0}\x{A0}
  793. 0: X \x{a0}
  794. /X\s+\x{A0}/B,utf,tables=2
  795. ------------------------------------------------------------------
  796. Bra
  797. X
  798. \s+
  799. \x{a0}
  800. Ket
  801. End
  802. ------------------------------------------------------------------
  803. X\x20\x{A0}\x{A0}
  804. 0: X \x{a0}\x{a0}
  805. /\S+\x{A0}/B,utf
  806. ------------------------------------------------------------------
  807. Bra
  808. \S+
  809. \x{a0}
  810. Ket
  811. End
  812. ------------------------------------------------------------------
  813. X\x{A0}\x{A0}
  814. 0: X\x{a0}\x{a0}
  815. /\S+\x{A0}/B,utf,tables=2
  816. ------------------------------------------------------------------
  817. Bra
  818. \S++
  819. \x{a0}
  820. Ket
  821. End
  822. ------------------------------------------------------------------
  823. X\x{A0}\x{A0}
  824. 0: X\x{a0}
  825. /\x{a0}+\s!/B,utf
  826. ------------------------------------------------------------------
  827. Bra
  828. \x{a0}++
  829. \s
  830. !
  831. Ket
  832. End
  833. ------------------------------------------------------------------
  834. \x{a0}\x20!
  835. 0: \x{a0} !
  836. /\x{a0}+\s!/B,utf,tables=2
  837. ------------------------------------------------------------------
  838. Bra
  839. \x{a0}+
  840. \s
  841. !
  842. Ket
  843. End
  844. ------------------------------------------------------------------
  845. \x{a0}\x20!
  846. 0: \x{a0} !
  847. /(*UTF)abc/never_utf
  848. Failed: error 174 at offset 6: using UTF is disabled by the application
  849. /abc/utf,never_utf
  850. Failed: error 174 at offset 0: using UTF is disabled by the application
  851. /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
  852. ------------------------------------------------------------------
  853. Bra
  854. /i A\x{391}\x{10427}\x{ff3a}\x{1fb0}
  855. Ket
  856. End
  857. ------------------------------------------------------------------
  858. Capture group count = 0
  859. Options: caseless utf
  860. First code unit = 'A' (caseless)
  861. Last code unit = \x{1fb0} (caseless)
  862. Subject length lower bound = 5
  863. /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
  864. ------------------------------------------------------------------
  865. Bra
  866. A\x{391}\x{10427}\x{ff3a}\x{1fb0}
  867. Ket
  868. End
  869. ------------------------------------------------------------------
  870. Capture group count = 0
  871. Options: utf
  872. First code unit = 'A'
  873. Last code unit = \x{1fb0}
  874. Subject length lower bound = 5
  875. /AB\x{1fb0}/IB,utf
  876. ------------------------------------------------------------------
  877. Bra
  878. AB\x{1fb0}
  879. Ket
  880. End
  881. ------------------------------------------------------------------
  882. Capture group count = 0
  883. Options: utf
  884. First code unit = 'A'
  885. Last code unit = \x{1fb0}
  886. Subject length lower bound = 3
  887. /AB\x{1fb0}/IBi,utf
  888. ------------------------------------------------------------------
  889. Bra
  890. /i AB\x{1fb0}
  891. Ket
  892. End
  893. ------------------------------------------------------------------
  894. Capture group count = 0
  895. Options: caseless utf
  896. First code unit = 'A' (caseless)
  897. Last code unit = \x{1fb0} (caseless)
  898. Subject length lower bound = 3
  899. /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
  900. Capture group count = 0
  901. Options: caseless utf
  902. First code unit = \x{401} (caseless)
  903. Last code unit = \x{42f} (caseless)
  904. Subject length lower bound = 17
  905. \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
  906. 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
  907. \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
  908. 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
  909. /[ⱥ]/Bi,utf
  910. ------------------------------------------------------------------
  911. Bra
  912. /i \x{2c65}
  913. Ket
  914. End
  915. ------------------------------------------------------------------
  916. /[^ⱥ]/Bi,utf
  917. ------------------------------------------------------------------
  918. Bra
  919. /i [^\x{2c65}]
  920. Ket
  921. End
  922. ------------------------------------------------------------------
  923. /[[:blank:]]/B,ucp
  924. ------------------------------------------------------------------
  925. Bra
  926. [\x09 \xa0\x{1680}\x{180e}\x{2000}-\x{200a}\x{202f}\x{205f}\x{3000}]
  927. Ket
  928. End
  929. ------------------------------------------------------------------
  930. /\x{212a}+/Ii,utf
  931. Capture group count = 0
  932. Options: caseless utf
  933. Starting code units: K k \xff
  934. Subject length lower bound = 1
  935. KKkk\x{212a}
  936. 0: KKkk\x{212a}
  937. /s+/Ii,utf
  938. Capture group count = 0
  939. Options: caseless utf
  940. Starting code units: S s \xff
  941. Subject length lower bound = 1
  942. SSss\x{17f}
  943. 0: SSss\x{17f}
  944. # Non-UTF characters should give errors in both 16-bit and 32-bit modes.
  945. /\x{110000}/utf
  946. Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too large
  947. /\o{4200000}/utf
  948. Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
  949. /\x{100}*A/IB,utf
  950. ------------------------------------------------------------------
  951. Bra
  952. \x{100}*+
  953. A
  954. Ket
  955. End
  956. ------------------------------------------------------------------
  957. Capture group count = 0
  958. Options: utf
  959. Starting code units: A \xff
  960. Last code unit = 'A'
  961. Subject length lower bound = 1
  962. A
  963. 0: A
  964. /\x{100}*\d(?R)/IB,utf
  965. ------------------------------------------------------------------
  966. Bra
  967. \x{100}*+
  968. \d
  969. Recurse
  970. Ket
  971. End
  972. ------------------------------------------------------------------
  973. Capture group count = 0
  974. Options: utf
  975. Starting code units: 0 1 2 3 4 5 6 7 8 9 \xff
  976. Subject length lower bound = 1
  977. /[Z\x{100}]/IB,utf
  978. ------------------------------------------------------------------
  979. Bra
  980. [Z\x{100}]
  981. Ket
  982. End
  983. ------------------------------------------------------------------
  984. Capture group count = 0
  985. Options: utf
  986. Starting code units: Z \xff
  987. Subject length lower bound = 1
  988. Z\x{100}
  989. 0: Z
  990. \x{100}
  991. 0: \x{100}
  992. \x{100}Z
  993. 0: \x{100}
  994. /[z-\x{100}]/IB,utf
  995. ------------------------------------------------------------------
  996. Bra
  997. [z-\xff\x{100}]
  998. Ket
  999. End
  1000. ------------------------------------------------------------------
  1001. Capture group count = 0
  1002. Options: utf
  1003. Starting code units: z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87
  1004. \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96
  1005. \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5
  1006. \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4
  1007. \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3
  1008. \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2
  1009. \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1
  1010. \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0
  1011. \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
  1012. Subject length lower bound = 1
  1013. /[z\Qa-d]Ā\E]/IB,utf
  1014. ------------------------------------------------------------------
  1015. Bra
  1016. [\-\]adz\x{100}]
  1017. Ket
  1018. End
  1019. ------------------------------------------------------------------
  1020. Capture group count = 0
  1021. Options: utf
  1022. Starting code units: - ] a d z \xff
  1023. Subject length lower bound = 1
  1024. \x{100}
  1025. 0: \x{100}
  1026. Ā
  1027. 0: \x{100}
  1028. /[ab\x{100}]abc(xyz(?1))/IB,utf
  1029. ------------------------------------------------------------------
  1030. Bra
  1031. [ab\x{100}]
  1032. abc
  1033. CBra 1
  1034. xyz
  1035. Recurse
  1036. Ket
  1037. Ket
  1038. End
  1039. ------------------------------------------------------------------
  1040. Capture group count = 1
  1041. Options: utf
  1042. Starting code units: a b \xff
  1043. Last code unit = 'z'
  1044. Subject length lower bound = 7
  1045. /\x{100}*\s/IB,utf
  1046. ------------------------------------------------------------------
  1047. Bra
  1048. \x{100}*+
  1049. \s
  1050. Ket
  1051. End
  1052. ------------------------------------------------------------------
  1053. Capture group count = 0
  1054. Options: utf
  1055. Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \xff
  1056. Subject length lower bound = 1
  1057. /\x{100}*\d/IB,utf
  1058. ------------------------------------------------------------------
  1059. Bra
  1060. \x{100}*+
  1061. \d
  1062. Ket
  1063. End
  1064. ------------------------------------------------------------------
  1065. Capture group count = 0
  1066. Options: utf
  1067. Starting code units: 0 1 2 3 4 5 6 7 8 9 \xff
  1068. Subject length lower bound = 1
  1069. /\x{100}*\w/IB,utf
  1070. ------------------------------------------------------------------
  1071. Bra
  1072. \x{100}*+
  1073. \w
  1074. Ket
  1075. End
  1076. ------------------------------------------------------------------
  1077. Capture group count = 0
  1078. Options: utf
  1079. Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
  1080. Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z
  1081. \xff
  1082. Subject length lower bound = 1
  1083. /\x{100}*\D/IB,utf
  1084. ------------------------------------------------------------------
  1085. Bra
  1086. \x{100}*
  1087. \D
  1088. Ket
  1089. End
  1090. ------------------------------------------------------------------
  1091. Capture group count = 0
  1092. Options: utf
  1093. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  1094. \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
  1095. \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = >
  1096. ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c
  1097. d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82
  1098. \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91
  1099. \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0
  1100. \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf
  1101. \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe
  1102. \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
  1103. \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
  1104. \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
  1105. \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
  1106. \xfb \xfc \xfd \xfe \xff
  1107. Subject length lower bound = 1
  1108. /\x{100}*\S/IB,utf
  1109. ------------------------------------------------------------------
  1110. Bra
  1111. \x{100}*
  1112. \S
  1113. Ket
  1114. End
  1115. ------------------------------------------------------------------
  1116. Capture group count = 0
  1117. Options: utf
  1118. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f
  1119. \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e
  1120. \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C
  1121. D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h
  1122. i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84
  1123. \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93
  1124. \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2
  1125. \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1
  1126. \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0
  1127. \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf
  1128. \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde
  1129. \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed
  1130. \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc
  1131. \xfd \xfe \xff
  1132. Subject length lower bound = 1
  1133. /\x{100}*\W/IB,utf
  1134. ------------------------------------------------------------------
  1135. Bra
  1136. \x{100}*
  1137. \W
  1138. Ket
  1139. End
  1140. ------------------------------------------------------------------
  1141. Capture group count = 0
  1142. Options: utf
  1143. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  1144. \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
  1145. \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = >
  1146. ? @ [ \ ] ^ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89
  1147. \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98
  1148. \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7
  1149. \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6
  1150. \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5
  1151. \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4
  1152. \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3
  1153. \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2
  1154. \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
  1155. Subject length lower bound = 1
  1156. /[\x{105}-\x{109}]/IBi,utf
  1157. ------------------------------------------------------------------
  1158. Bra
  1159. [\x{104}-\x{109}]
  1160. Ket
  1161. End
  1162. ------------------------------------------------------------------
  1163. Capture group count = 0
  1164. Options: caseless utf
  1165. Starting code units: \xff
  1166. Subject length lower bound = 1
  1167. \x{104}
  1168. 0: \x{104}
  1169. \x{105}
  1170. 0: \x{105}
  1171. \x{109}
  1172. 0: \x{109}
  1173. \= Expect no match
  1174. \x{100}
  1175. No match
  1176. \x{10a}
  1177. No match
  1178. /[z-\x{100}]/IBi,utf
  1179. ------------------------------------------------------------------
  1180. Bra
  1181. [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}]
  1182. Ket
  1183. End
  1184. ------------------------------------------------------------------
  1185. Capture group count = 0
  1186. Options: caseless utf
  1187. Starting code units: Z z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86
  1188. \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95
  1189. \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4
  1190. \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3
  1191. \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2
  1192. \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1
  1193. \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0
  1194. \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef
  1195. \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe
  1196. \xff
  1197. Subject length lower bound = 1
  1198. Z
  1199. 0: Z
  1200. z
  1201. 0: z
  1202. \x{39c}
  1203. 0: \x{39c}
  1204. \x{178}
  1205. 0: \x{178}
  1206. |
  1207. 0: |
  1208. \x{80}
  1209. 0: \x{80}
  1210. \x{ff}
  1211. 0: \x{ff}
  1212. \x{100}
  1213. 0: \x{100}
  1214. \x{101}
  1215. 0: \x{101}
  1216. \= Expect no match
  1217. \x{102}
  1218. No match
  1219. Y
  1220. No match
  1221. y
  1222. No match
  1223. /[z-\x{100}]/IBi,utf
  1224. ------------------------------------------------------------------
  1225. Bra
  1226. [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}]
  1227. Ket
  1228. End
  1229. ------------------------------------------------------------------
  1230. Capture group count = 0
  1231. Options: caseless utf
  1232. Starting code units: Z z { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86
  1233. \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95
  1234. \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4
  1235. \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3
  1236. \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2
  1237. \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1
  1238. \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0
  1239. \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef
  1240. \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe
  1241. \xff
  1242. Subject length lower bound = 1
  1243. /\x{3a3}B/IBi,utf
  1244. ------------------------------------------------------------------
  1245. Bra
  1246. clist 03a3 03c2 03c3
  1247. /i B
  1248. Ket
  1249. End
  1250. ------------------------------------------------------------------
  1251. Capture group count = 0
  1252. Options: caseless utf
  1253. Starting code units: \xff
  1254. Last code unit = 'B' (caseless)
  1255. Subject length lower bound = 2
  1256. /./utf
  1257. \x{110000}
  1258. ** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
  1259. /(*UTF)ab������z/B
  1260. ------------------------------------------------------------------
  1261. Bra
  1262. ab\x{fd}\x{bf}\x{bf}\x{bf}\x{bf}\x{bf}z
  1263. Ket
  1264. End
  1265. ------------------------------------------------------------------
  1266. /ab������z/utf
  1267. ** Failed: character value greater than 0x10ffff cannot be converted to UTF
  1268. /[\W\p{Any}]/B
  1269. ------------------------------------------------------------------
  1270. Bra
  1271. [\x00-/:-@[-^`{-\xff\p{Any}\x{100}-\x{ffff}]
  1272. Ket
  1273. End
  1274. ------------------------------------------------------------------
  1275. abc
  1276. 0: a
  1277. 123
  1278. 0: 1
  1279. /[\W\pL]/B
  1280. ------------------------------------------------------------------
  1281. Bra
  1282. [\x00-/:-@[-^`{-\xff\p{L}\x{100}-\x{ffff}]
  1283. Ket
  1284. End
  1285. ------------------------------------------------------------------
  1286. abc
  1287. 0: a
  1288. \x{100}
  1289. 0: \x{100}
  1290. \x{308}
  1291. 0: \x{308}
  1292. \= Expect no match
  1293. 123
  1294. No match
  1295. /[\s[:^ascii:]]/B,ucp
  1296. ------------------------------------------------------------------
  1297. Bra
  1298. [\x80-\xff\p{Xsp}\x{100}-\x{ffff}]
  1299. Ket
  1300. End
  1301. ------------------------------------------------------------------
  1302. /\pP/ucp
  1303. \x{7fffffff}
  1304. ** Character \x{7fffffff} is greater than 0xffff and UTF-16 mode is not enabled.
  1305. ** Truncation will probably give the wrong result.
  1306. No match
  1307. # A special extra option allows escaped surrogate code points in 32-bit mode,
  1308. # but subjects containing them must not be UTF-checked. These patterns give
  1309. # errors in 16-bit mode.
  1310. /\x{d800}/I,utf,allow_surrogate_escapes
  1311. Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
  1312. \x{d800}\=no_utf_check
  1313. /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
  1314. Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
  1315. \x{dfff}\x{df01}\=no_utf_check
  1316. # This has different starting code units in 8-bit mode.
  1317. /^[^ab]/IB,utf
  1318. ------------------------------------------------------------------
  1319. Bra
  1320. ^
  1321. [\x00-`c-\xff] (neg)
  1322. Ket
  1323. End
  1324. ------------------------------------------------------------------
  1325. Capture group count = 0
  1326. Compile options: utf
  1327. Overall options: anchored utf
  1328. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  1329. \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
  1330. \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
  1331. 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
  1332. Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
  1333. \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e
  1334. \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d
  1335. \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac
  1336. \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb
  1337. \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca
  1338. \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9
  1339. \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8
  1340. \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7
  1341. \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
  1342. Subject length lower bound = 1
  1343. c
  1344. 0: c
  1345. \x{ff}
  1346. 0: \x{ff}
  1347. \x{100}
  1348. 0: \x{100}
  1349. \= Expect no match
  1350. aaa
  1351. No match
  1352. # Offsets are different in 8-bit mode.
  1353. /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
  1354. 123abcáyzabcdef789abcሴqr
  1355. 1(2) Old 6 6 "" New 6 8 "<>"
  1356. 2(2) Old 12 12 "" New 14 16 "<>"
  1357. 3(2) Old 12 15 "def" New 16 21 "<def>"
  1358. 4(2) Old 21 21 "" New 27 29 "<>"
  1359. 4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
  1360. # A few script run tests in non-UTF mode (but they need Unicode support)
  1361. /^(*script_run:.{4})/
  1362. \x{3041}\x{30a1}\x{3007}\x{3007} Hiragana Katakana Han Han
  1363. 0: \x{3041}\x{30a1}\x{3007}\x{3007}
  1364. \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han
  1365. 0: \x{30a1}\x{3041}\x{3007}\x{3007}
  1366. \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul
  1367. 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
  1368. /^(*sr:.*)/utf,allow_surrogate_escapes
  1369. Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
  1370. \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana
  1371. \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check
  1372. /(?(n/utf
  1373. Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?)
  1374. /(?(á/utf
  1375. Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?)
  1376. # Invalid UTF-16/32 tests.
  1377. /.../g,match_invalid_utf
  1378. abcd\x{df00}wxzy\x{df00}pqrs
  1379. 0: abc
  1380. 0: wxz
  1381. 0: pqr
  1382. abcd\x{80}wxzy\x{df00}pqrs
  1383. 0: abc
  1384. 0: d\x{80}w
  1385. 0: xzy
  1386. 0: pqr
  1387. /abc/match_invalid_utf
  1388. ab\x{df00}ab\=ph
  1389. Partial match: ab
  1390. \= Expect no match
  1391. ab\x{df00}cdef\=ph
  1392. No match
  1393. /.a/match_invalid_utf
  1394. ab\=ph
  1395. Partial match: b
  1396. ab\=ps
  1397. Partial match: b
  1398. \= Expect no match
  1399. b\x{df00}\=ph
  1400. No match
  1401. b\x{df00}\=ps
  1402. No match
  1403. /.a$/match_invalid_utf
  1404. ab\=ph
  1405. Partial match: b
  1406. ab\=ps
  1407. Partial match: b
  1408. \= Expect no match
  1409. b\x{df00}\=ph
  1410. No match
  1411. b\x{df00}\=ps
  1412. No match
  1413. /ab$/match_invalid_utf
  1414. ab\x{df00}cdeab
  1415. 0: ab
  1416. \= Expect no match
  1417. ab\x{df00}cde
  1418. No match
  1419. /.../g,match_invalid_utf
  1420. abcd\x{80}wxzy\x{df00}pqrs
  1421. 0: abc
  1422. 0: d\x{80}w
  1423. 0: xzy
  1424. 0: pqr
  1425. /(?<=x)../g,match_invalid_utf
  1426. abcd\x{80}wxzy\x{df00}pqrs
  1427. 0: zy
  1428. abcd\x{80}wxzy\x{df00}xpqrs
  1429. 0: zy
  1430. 0: pq
  1431. /X$/match_invalid_utf
  1432. \= Expect no match
  1433. X\x{df00}
  1434. No match
  1435. /(?<=..)X/match_invalid_utf,aftertext
  1436. AB\x{df00}AQXYZ
  1437. 0: X
  1438. 0+ YZ
  1439. AB\x{df00}AQXYZ\=offset=5
  1440. 0: X
  1441. 0+ YZ
  1442. AB\x{df00}\x{df00}AXYZXC\=offset=5
  1443. 0: X
  1444. 0+ C
  1445. \= Expect no match
  1446. AB\x{df00}XYZ
  1447. No match
  1448. AB\x{df00}XYZ\=offset=3
  1449. No match
  1450. AB\x{df00}AXYZ
  1451. No match
  1452. AB\x{df00}AXYZ\=offset=4
  1453. No match
  1454. AB\x{df00}\x{df00}AXYZ\=offset=5
  1455. No match
  1456. /.../match_invalid_utf
  1457. \= Expect no match
  1458. A\x{d800}B
  1459. No match
  1460. A\x{110000}B
  1461. ** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
  1462. /aa/utf,ucp,match_invalid_utf,global
  1463. aa\x{d800}aa
  1464. 0: aa
  1465. 0: aa
  1466. /aa/utf,ucp,match_invalid_utf,global
  1467. \x{d800}aa
  1468. 0: aa
  1469. /A\z/utf,match_invalid_utf
  1470. A\x{df00}\n
  1471. No match
  1472. # ----------------------------------------------------
  1473. /(*UTF)(?=\x{123})/I
  1474. Capture group count = 0
  1475. May match empty string
  1476. Compile options: <none>
  1477. Overall options: utf
  1478. First code unit = \x{123}
  1479. Subject length lower bound = 1
  1480. /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
  1481. Capture group count = 0
  1482. Options: utf
  1483. First code unit = \xc1 (caseless)
  1484. Last code unit = \x{145} (caseless)
  1485. Subject length lower bound = 3
  1486. /[\xff\x{ffff}]/I,utf
  1487. Capture group count = 0
  1488. Options: utf
  1489. Starting code units: \xff
  1490. Subject length lower bound = 1
  1491. /[\xff\x{ff}]/I,utf
  1492. Capture group count = 0
  1493. Options: utf
  1494. Starting code units: \xff
  1495. Subject length lower bound = 1
  1496. /[\xff\x{ff}]/I
  1497. Capture group count = 0
  1498. Starting code units: \xff
  1499. Subject length lower bound = 1
  1500. /[Ss]/I
  1501. Capture group count = 0
  1502. First code unit = 'S' (caseless)
  1503. Subject length lower bound = 1
  1504. /[Ss]/I,utf
  1505. Capture group count = 0
  1506. Options: utf
  1507. Starting code units: S s
  1508. Subject length lower bound = 1
  1509. /(?:\x{ff}|\x{3000})/I,utf
  1510. Capture group count = 0
  1511. Options: utf
  1512. Starting code units: \xff
  1513. Subject length lower bound = 1
  1514. # ----------------------------------------------------
  1515. # UCP and casing tests
  1516. /\x{120}/i,I
  1517. Capture group count = 0
  1518. Options: caseless
  1519. First code unit = \x{120}
  1520. Subject length lower bound = 1
  1521. /\x{c1}/i,I,ucp
  1522. Capture group count = 0
  1523. Options: caseless ucp
  1524. First code unit = \xc1 (caseless)
  1525. Subject length lower bound = 1
  1526. /[\x{120}\x{121}]/iB,ucp
  1527. ------------------------------------------------------------------
  1528. Bra
  1529. /i \x{120}
  1530. Ket
  1531. End
  1532. ------------------------------------------------------------------
  1533. /[ab\x{120}]+/iB,ucp
  1534. ------------------------------------------------------------------
  1535. Bra
  1536. [ABab\x{120}-\x{121}]++
  1537. Ket
  1538. End
  1539. ------------------------------------------------------------------
  1540. aABb\x{121}\x{120}
  1541. 0: aABb\x{121}\x{120}
  1542. /\x{c1}/i,no_start_optimize
  1543. \= Expect no match
  1544. \x{e1}
  1545. No match
  1546. /\x{120}\x{c1}/i,ucp,no_start_optimize
  1547. \x{121}\x{e1}
  1548. 0: \x{121}\xe1
  1549. /\x{120}\x{c1}/i,ucp
  1550. \x{121}\x{e1}
  1551. 0: \x{121}\xe1
  1552. /[^\x{120}]/i,no_start_optimize
  1553. \x{121}
  1554. 0: \x{121}
  1555. /[^\x{120}]/i,ucp,no_start_optimize
  1556. \= Expect no match
  1557. \x{121}
  1558. No match
  1559. /[^\x{120}]/i
  1560. \x{121}
  1561. 0: \x{121}
  1562. /[^\x{120}]/i,ucp
  1563. \= Expect no match
  1564. \x{121}
  1565. No match
  1566. /\x{120}{2}/i,ucp
  1567. \x{121}\x{121}
  1568. 0: \x{121}\x{121}
  1569. /[^\x{120}]{2}/i,ucp
  1570. \= Expect no match
  1571. \x{121}\x{121}
  1572. No match
  1573. /\x{c1}+\x{e1}/iB,ucp
  1574. ------------------------------------------------------------------
  1575. Bra
  1576. /i \x{c1}+
  1577. /i \x{e1}
  1578. Ket
  1579. End
  1580. ------------------------------------------------------------------
  1581. \x{c1}\x{c1}\x{c1}
  1582. 0: \xc1\xc1\xc1
  1583. /\x{c1}+\x{e1}/iIB,ucp
  1584. ------------------------------------------------------------------
  1585. Bra
  1586. /i \x{c1}+
  1587. /i \x{e1}
  1588. Ket
  1589. End
  1590. ------------------------------------------------------------------
  1591. Capture group count = 0
  1592. Options: caseless ucp
  1593. First code unit = \xc1 (caseless)
  1594. Last code unit = \xe1 (caseless)
  1595. Subject length lower bound = 2
  1596. \x{c1}\x{c1}\x{c1}
  1597. 0: \xc1\xc1\xc1
  1598. \x{e1}\x{e1}\x{e1}
  1599. 0: \xe1\xe1\xe1
  1600. /a|\x{c1}/iI,ucp
  1601. Capture group count = 0
  1602. Options: caseless ucp
  1603. Starting code units: A a \xc1 \xe1
  1604. Subject length lower bound = 1
  1605. \x{e1}xxx
  1606. 0: \xe1
  1607. /\x{c1}|\x{e1}/iI,ucp
  1608. Capture group count = 0
  1609. Options: caseless ucp
  1610. First code unit = \xc1 (caseless)
  1611. Subject length lower bound = 1
  1612. /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
  1613. X\x{e1}Y
  1614. 1: >\xc1<
  1615. /X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
  1616. X\x{121}Y
  1617. 1: >\x{120}<
  1618. /s/i,ucp
  1619. \x{17f}
  1620. 0: \x{17f}
  1621. /s/i,utf
  1622. \x{17f}
  1623. 0: \x{17f}
  1624. /[^s]/i,ucp
  1625. \= Expect no match
  1626. \x{17f}
  1627. No match
  1628. /[^s]/i,utf
  1629. \= Expect no match
  1630. \x{17f}
  1631. No match
  1632. # ----------------------------------------------------
  1633. # Quantifier after a literal that has the value of META_ACCEPT (not UTF). This
  1634. # fails in 16-bit mode, but is OK for 32-bit.
  1635. /\x{802a0000}*/
  1636. Failed: error 134 at offset 11: character code point value in \x{} or \o{} is too large
  1637. \x{802a0000}\x{802a0000}
  1638. # UTF matching without UTF, check invalid UTF characters
  1639. /\X++/
  1640. a\x{110000}\x{ffffffff}
  1641. ** Character \x{110000} is greater than 0xffff and UTF-16 mode is not enabled.
  1642. ** Truncation will probably give the wrong result.
  1643. ** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
  1644. ** Truncation will probably give the wrong result.
  1645. 0: a\x00\x{ffff}
  1646. # This used to loop in 32-bit mode; it will fail in 16-bit mode.
  1647. /[\x{ffffffff}]/caseless,ucp
  1648. Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large
  1649. \x{ffffffff}xyz
  1650. # These are 32-bit tests for handling 0xffffffff when in UCP caseless mode. They
  1651. # will give errors in 16-bit mode.
  1652. /k*\x{ffffffff}/caseless,ucp
  1653. Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
  1654. \x{ffffffff}
  1655. /k+\x{ffffffff}/caseless,ucp,no_start_optimize
  1656. Failed: error 134 at offset 13: character code point value in \x{} or \o{} is too large
  1657. K\x{ffffffff}
  1658. \= Expect no match
  1659. \x{ffffffff}\x{ffffffff}
  1660. /k{2}\x{ffffffff}/caseless,ucp,no_start_optimize
  1661. Failed: error 134 at offset 15: character code point value in \x{} or \o{} is too large
  1662. \= Expect no match
  1663. \x{ffffffff}\x{ffffffff}\x{ffffffff}
  1664. /k\x{ffffffff}/caseless,ucp,no_start_optimize
  1665. Failed: error 134 at offset 12: character code point value in \x{} or \o{} is too large
  1666. K\x{ffffffff}
  1667. \= Expect no match
  1668. \x{ffffffff}\x{ffffffff}\x{ffffffff}
  1669. /k{2,}?Z/caseless,ucp,no_start_optimize,no_auto_possess
  1670. \= Expect no match
  1671. Kk\x{ffffffff}\x{ffffffff}\x{ffffffff}Z
  1672. ** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
  1673. ** Truncation will probably give the wrong result.
  1674. ** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
  1675. ** Truncation will probably give the wrong result.
  1676. ** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled.
  1677. ** Truncation will probably give the wrong result.
  1678. No match
  1679. # ---------------------------------------------------------
  1680. # End of testinput12