github的一些开源项目
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1928 lines
50 KiB

  1. # This set of tests is for UTF-8 support and Unicode property support, with
  2. # relevance only for the 8-bit library.
  3. #newline_default lf any anycrlf
  4. # The next 5 patterns have UTF-8 errors
  5. /[�]/utf
  6. Failed: error -8 at offset 1: UTF-8 error: byte 2 top bits not 0x80
  7. /�/utf
  8. Failed: error -3 at offset 0: UTF-8 error: 1 byte missing at end
  9. /���xxx/utf
  10. Failed: error -8 at offset 0: UTF-8 error: byte 2 top bits not 0x80
  11. /��������/utf
  12. Failed: error -22 at offset 2: UTF-8 error: isolated byte with 0x80 bit set
  13. /��������/match_invalid_utf
  14. Failed: error -22 at offset 2: UTF-8 error: isolated byte with 0x80 bit set
  15. # Now test subjects
  16. /badutf/utf
  17. \= Expect UTF-8 errors
  18. X\xdf
  19. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 1
  20. XX\xef
  21. Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
  22. XXX\xef\x80
  23. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3
  24. X\xf7
  25. Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 1
  26. XX\xf7\x80
  27. Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
  28. XXX\xf7\x80\x80
  29. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 3
  30. \xfb
  31. Failed: error -6: UTF-8 error: 4 bytes missing at end at offset 0
  32. \xfb\x80
  33. Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
  34. \xfb\x80\x80
  35. Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
  36. \xfb\x80\x80\x80
  37. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0
  38. \xfd
  39. Failed: error -7: UTF-8 error: 5 bytes missing at end at offset 0
  40. \xfd\x80
  41. Failed: error -6: UTF-8 error: 4 bytes missing at end at offset 0
  42. \xfd\x80\x80
  43. Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
  44. \xfd\x80\x80\x80
  45. Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
  46. \xfd\x80\x80\x80\x80
  47. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0
  48. \xdf\x7f
  49. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0
  50. \xef\x7f\x80
  51. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0
  52. \xef\x80\x7f
  53. Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 at offset 0
  54. \xf7\x7f\x80\x80
  55. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0
  56. \xf7\x80\x7f\x80
  57. Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 at offset 0
  58. \xf7\x80\x80\x7f
  59. Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 at offset 0
  60. \xfb\x7f\x80\x80\x80
  61. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0
  62. \xfb\x80\x7f\x80\x80
  63. Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 at offset 0
  64. \xfb\x80\x80\x7f\x80
  65. Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 at offset 0
  66. \xfb\x80\x80\x80\x7f
  67. Failed: error -11: UTF-8 error: byte 5 top bits not 0x80 at offset 0
  68. \xfd\x7f\x80\x80\x80\x80
  69. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 0
  70. \xfd\x80\x7f\x80\x80\x80
  71. Failed: error -9: UTF-8 error: byte 3 top bits not 0x80 at offset 0
  72. \xfd\x80\x80\x7f\x80\x80
  73. Failed: error -10: UTF-8 error: byte 4 top bits not 0x80 at offset 0
  74. \xfd\x80\x80\x80\x7f\x80
  75. Failed: error -11: UTF-8 error: byte 5 top bits not 0x80 at offset 0
  76. \xfd\x80\x80\x80\x80\x7f
  77. Failed: error -12: UTF-8 error: byte 6 top bits not 0x80 at offset 0
  78. \xed\xa0\x80
  79. Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 0
  80. \xc0\x8f
  81. Failed: error -17: UTF-8 error: overlong 2-byte sequence at offset 0
  82. \xe0\x80\x8f
  83. Failed: error -18: UTF-8 error: overlong 3-byte sequence at offset 0
  84. \xf0\x80\x80\x8f
  85. Failed: error -19: UTF-8 error: overlong 4-byte sequence at offset 0
  86. \xf8\x80\x80\x80\x8f
  87. Failed: error -20: UTF-8 error: overlong 5-byte sequence at offset 0
  88. \xfc\x80\x80\x80\x80\x8f
  89. Failed: error -21: UTF-8 error: overlong 6-byte sequence at offset 0
  90. \x80
  91. Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 0
  92. \xfe
  93. Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0
  94. \xff
  95. Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0
  96. /badutf/utf
  97. \= Expect UTF-8 errors
  98. XX\xfb\x80\x80\x80\x80
  99. Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) at offset 2
  100. XX\xfd\x80\x80\x80\x80\x80
  101. Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 2
  102. XX\xf7\xbf\xbf\xbf
  103. Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined at offset 2
  104. /shortutf/utf
  105. \= Expect UTF-8 errors
  106. XX\xdf\=ph
  107. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
  108. XX\xef\=ph
  109. Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 2
  110. XX\xef\x80\=ph
  111. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 2
  112. \xf7\=ph
  113. Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
  114. \xf7\x80\=ph
  115. Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
  116. \xf7\x80\x80\=ph
  117. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0
  118. \xfb\=ph
  119. Failed: error -6: UTF-8 error: 4 bytes missing at end at offset 0
  120. \xfb\x80\=ph
  121. Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
  122. \xfb\x80\x80\=ph
  123. Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
  124. \xfb\x80\x80\x80\=ph
  125. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0
  126. \xfd\=ph
  127. Failed: error -7: UTF-8 error: 5 bytes missing at end at offset 0
  128. \xfd\x80\=ph
  129. Failed: error -6: UTF-8 error: 4 bytes missing at end at offset 0
  130. \xfd\x80\x80\=ph
  131. Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
  132. \xfd\x80\x80\x80\=ph
  133. Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
  134. \xfd\x80\x80\x80\x80\=ph
  135. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 0
  136. /anything/utf
  137. \= Expect UTF-8 errors
  138. X\xc0\x80
  139. Failed: error -17: UTF-8 error: overlong 2-byte sequence at offset 1
  140. XX\xc1\x8f
  141. Failed: error -17: UTF-8 error: overlong 2-byte sequence at offset 2
  142. XXX\xe0\x9f\x80
  143. Failed: error -18: UTF-8 error: overlong 3-byte sequence at offset 3
  144. \xf0\x8f\x80\x80
  145. Failed: error -19: UTF-8 error: overlong 4-byte sequence at offset 0
  146. \xf8\x87\x80\x80\x80
  147. Failed: error -20: UTF-8 error: overlong 5-byte sequence at offset 0
  148. \xfc\x83\x80\x80\x80\x80
  149. Failed: error -21: UTF-8 error: overlong 6-byte sequence at offset 0
  150. \xfe\x80\x80\x80\x80\x80
  151. Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0
  152. \xff\x80\x80\x80\x80\x80
  153. Failed: error -23: UTF-8 error: illegal byte (0xfe or 0xff) at offset 0
  154. \xf8\x88\x80\x80\x80
  155. Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) at offset 0
  156. \xf9\x87\x80\x80\x80
  157. Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) at offset 0
  158. \xfc\x84\x80\x80\x80\x80
  159. Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 0
  160. \xfd\x83\x80\x80\x80\x80
  161. Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 0
  162. \= Expect no match
  163. \xc3\x8f
  164. No match
  165. \xe0\xaf\x80
  166. No match
  167. \xe1\x80\x80
  168. No match
  169. \xf0\x9f\x80\x80
  170. No match
  171. \xf1\x8f\x80\x80
  172. No match
  173. \xf8\x88\x80\x80\x80\=no_utf_check
  174. No match
  175. \xf9\x87\x80\x80\x80\=no_utf_check
  176. No match
  177. \xfc\x84\x80\x80\x80\x80\=no_utf_check
  178. No match
  179. \xfd\x83\x80\x80\x80\x80\=no_utf_check
  180. No match
  181. # Similar tests with offsets
  182. /badutf/utf
  183. \= Expect UTF-8 errors
  184. X\xdfabcd
  185. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  186. X\xdfabcd\=offset=1
  187. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  188. \= Expect no match
  189. X\xdfabcd\=offset=2
  190. No match
  191. /(?<=x)badutf/utf
  192. \= Expect UTF-8 errors
  193. X\xdfabcd
  194. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  195. X\xdfabcd\=offset=1
  196. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  197. X\xdfabcd\=offset=2
  198. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  199. X\xdfabcd\xdf\=offset=3
  200. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 6
  201. \= Expect no match
  202. X\xdfabcd\=offset=3
  203. No match
  204. /(?<=xx)badutf/utf
  205. \= Expect UTF-8 errors
  206. X\xdfabcd
  207. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  208. X\xdfabcd\=offset=1
  209. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  210. X\xdfabcd\=offset=2
  211. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  212. X\xdfabcd\=offset=3
  213. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  214. /(?<=xxxx)badutf/utf
  215. \= Expect UTF-8 errors
  216. X\xdfabcd
  217. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  218. X\xdfabcd\=offset=1
  219. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  220. X\xdfabcd\=offset=2
  221. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  222. X\xdfabcd\=offset=3
  223. Failed: error -8: UTF-8 error: byte 2 top bits not 0x80 at offset 1
  224. X\xdfabc\xdf\=offset=6
  225. Failed: error -3: UTF-8 error: 1 byte missing at end at offset 5
  226. X\xdfabc\xdf\=offset=7
  227. Failed: error -33: bad offset value
  228. \= Expect no match
  229. X\xdfabcd\=offset=6
  230. No match
  231. /\x{100}/IB,utf
  232. ------------------------------------------------------------------
  233. Bra
  234. \x{100}
  235. Ket
  236. End
  237. ------------------------------------------------------------------
  238. Capture group count = 0
  239. Options: utf
  240. First code unit = \xc4
  241. Last code unit = \x80
  242. Subject length lower bound = 1
  243. /\x{1000}/IB,utf
  244. ------------------------------------------------------------------
  245. Bra
  246. \x{1000}
  247. Ket
  248. End
  249. ------------------------------------------------------------------
  250. Capture group count = 0
  251. Options: utf
  252. First code unit = \xe1
  253. Last code unit = \x80
  254. Subject length lower bound = 1
  255. /\x{10000}/IB,utf
  256. ------------------------------------------------------------------
  257. Bra
  258. \x{10000}
  259. Ket
  260. End
  261. ------------------------------------------------------------------
  262. Capture group count = 0
  263. Options: utf
  264. First code unit = \xf0
  265. Last code unit = \x80
  266. Subject length lower bound = 1
  267. /\x{100000}/IB,utf
  268. ------------------------------------------------------------------
  269. Bra
  270. \x{100000}
  271. Ket
  272. End
  273. ------------------------------------------------------------------
  274. Capture group count = 0
  275. Options: utf
  276. First code unit = \xf4
  277. Last code unit = \x80
  278. Subject length lower bound = 1
  279. /\x{10ffff}/IB,utf
  280. ------------------------------------------------------------------
  281. Bra
  282. \x{10ffff}
  283. Ket
  284. End
  285. ------------------------------------------------------------------
  286. Capture group count = 0
  287. Options: utf
  288. First code unit = \xf4
  289. Last code unit = \xbf
  290. Subject length lower bound = 1
  291. /[\x{ff}]/IB,utf
  292. ------------------------------------------------------------------
  293. Bra
  294. \x{ff}
  295. Ket
  296. End
  297. ------------------------------------------------------------------
  298. Capture group count = 0
  299. Options: utf
  300. First code unit = \xc3
  301. Last code unit = \xbf
  302. Subject length lower bound = 1
  303. /[\x{100}]/IB,utf
  304. ------------------------------------------------------------------
  305. Bra
  306. \x{100}
  307. Ket
  308. End
  309. ------------------------------------------------------------------
  310. Capture group count = 0
  311. Options: utf
  312. First code unit = \xc4
  313. Last code unit = \x80
  314. Subject length lower bound = 1
  315. /\x80/IB,utf
  316. ------------------------------------------------------------------
  317. Bra
  318. \x{80}
  319. Ket
  320. End
  321. ------------------------------------------------------------------
  322. Capture group count = 0
  323. Options: utf
  324. First code unit = \xc2
  325. Last code unit = \x80
  326. Subject length lower bound = 1
  327. /\xff/IB,utf
  328. ------------------------------------------------------------------
  329. Bra
  330. \x{ff}
  331. Ket
  332. End
  333. ------------------------------------------------------------------
  334. Capture group count = 0
  335. Options: utf
  336. First code unit = \xc3
  337. Last code unit = \xbf
  338. Subject length lower bound = 1
  339. /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
  340. ------------------------------------------------------------------
  341. Bra
  342. \x{d55c}\x{ad6d}\x{c5b4}
  343. Ket
  344. End
  345. ------------------------------------------------------------------
  346. Capture group count = 0
  347. Options: utf
  348. First code unit = \xed
  349. Last code unit = \xb4
  350. Subject length lower bound = 3
  351. \x{D55c}\x{ad6d}\x{C5B4}
  352. 0: \x{d55c}\x{ad6d}\x{c5b4}
  353. /\x{65e5}\x{672c}\x{8a9e}/IB,utf
  354. ------------------------------------------------------------------
  355. Bra
  356. \x{65e5}\x{672c}\x{8a9e}
  357. Ket
  358. End
  359. ------------------------------------------------------------------
  360. Capture group count = 0
  361. Options: utf
  362. First code unit = \xe6
  363. Last code unit = \x9e
  364. Subject length lower bound = 3
  365. \x{65e5}\x{672c}\x{8a9e}
  366. 0: \x{65e5}\x{672c}\x{8a9e}
  367. /\x{80}/IB,utf
  368. ------------------------------------------------------------------
  369. Bra
  370. \x{80}
  371. Ket
  372. End
  373. ------------------------------------------------------------------
  374. Capture group count = 0
  375. Options: utf
  376. First code unit = \xc2
  377. Last code unit = \x80
  378. Subject length lower bound = 1
  379. /\x{084}/IB,utf
  380. ------------------------------------------------------------------
  381. Bra
  382. \x{84}
  383. Ket
  384. End
  385. ------------------------------------------------------------------
  386. Capture group count = 0
  387. Options: utf
  388. First code unit = \xc2
  389. Last code unit = \x84
  390. Subject length lower bound = 1
  391. /\x{104}/IB,utf
  392. ------------------------------------------------------------------
  393. Bra
  394. \x{104}
  395. Ket
  396. End
  397. ------------------------------------------------------------------
  398. Capture group count = 0
  399. Options: utf
  400. First code unit = \xc4
  401. Last code unit = \x84
  402. Subject length lower bound = 1
  403. /\x{861}/IB,utf
  404. ------------------------------------------------------------------
  405. Bra
  406. \x{861}
  407. Ket
  408. End
  409. ------------------------------------------------------------------
  410. Capture group count = 0
  411. Options: utf
  412. First code unit = \xe0
  413. Last code unit = \xa1
  414. Subject length lower bound = 1
  415. /\x{212ab}/IB,utf
  416. ------------------------------------------------------------------
  417. Bra
  418. \x{212ab}
  419. Ket
  420. End
  421. ------------------------------------------------------------------
  422. Capture group count = 0
  423. Options: utf
  424. First code unit = \xf0
  425. Last code unit = \xab
  426. Subject length lower bound = 1
  427. /[^ab\xC0-\xF0]/IB,utf
  428. ------------------------------------------------------------------
  429. Bra
  430. [\x00-`c-\xbf\xf1-\xff] (neg)
  431. Ket
  432. End
  433. ------------------------------------------------------------------
  434. Capture group count = 0
  435. Options: utf
  436. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  437. \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
  438. \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
  439. 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
  440. Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
  441. \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0
  442. \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf
  443. \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee
  444. \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd
  445. \xfe \xff
  446. Subject length lower bound = 1
  447. \x{f1}
  448. 0: \x{f1}
  449. \x{bf}
  450. 0: \x{bf}
  451. \x{100}
  452. 0: \x{100}
  453. \x{1000}
  454. 0: \x{1000}
  455. \= Expect no match
  456. \x{c0}
  457. No match
  458. \x{f0}
  459. No match
  460. /Ā{3,4}/IB,utf
  461. ------------------------------------------------------------------
  462. Bra
  463. \x{100}{3}
  464. \x{100}?+
  465. Ket
  466. End
  467. ------------------------------------------------------------------
  468. Capture group count = 0
  469. Options: utf
  470. First code unit = \xc4
  471. Last code unit = \x80
  472. Subject length lower bound = 3
  473. \x{100}\x{100}\x{100}\x{100\x{100}
  474. 0: \x{100}\x{100}\x{100}
  475. /(\x{100}+|x)/IB,utf
  476. ------------------------------------------------------------------
  477. Bra
  478. CBra 1
  479. \x{100}++
  480. Alt
  481. x
  482. Ket
  483. Ket
  484. End
  485. ------------------------------------------------------------------
  486. Capture group count = 1
  487. Options: utf
  488. Starting code units: x \xc4
  489. Subject length lower bound = 1
  490. /(\x{100}*a|x)/IB,utf
  491. ------------------------------------------------------------------
  492. Bra
  493. CBra 1
  494. \x{100}*+
  495. a
  496. Alt
  497. x
  498. Ket
  499. Ket
  500. End
  501. ------------------------------------------------------------------
  502. Capture group count = 1
  503. Options: utf
  504. Starting code units: a x \xc4
  505. Subject length lower bound = 1
  506. /(\x{100}{0,2}a|x)/IB,utf
  507. ------------------------------------------------------------------
  508. Bra
  509. CBra 1
  510. \x{100}{0,2}+
  511. a
  512. Alt
  513. x
  514. Ket
  515. Ket
  516. End
  517. ------------------------------------------------------------------
  518. Capture group count = 1
  519. Options: utf
  520. Starting code units: a x \xc4
  521. Subject length lower bound = 1
  522. /(\x{100}{1,2}a|x)/IB,utf
  523. ------------------------------------------------------------------
  524. Bra
  525. CBra 1
  526. \x{100}
  527. \x{100}{0,1}+
  528. a
  529. Alt
  530. x
  531. Ket
  532. Ket
  533. End
  534. ------------------------------------------------------------------
  535. Capture group count = 1
  536. Options: utf
  537. Starting code units: x \xc4
  538. Subject length lower bound = 1
  539. /\x{100}/IB,utf
  540. ------------------------------------------------------------------
  541. Bra
  542. \x{100}
  543. Ket
  544. End
  545. ------------------------------------------------------------------
  546. Capture group count = 0
  547. Options: utf
  548. First code unit = \xc4
  549. Last code unit = \x80
  550. Subject length lower bound = 1
  551. /a\x{100}\x{101}*/IB,utf
  552. ------------------------------------------------------------------
  553. Bra
  554. a\x{100}
  555. \x{101}*+
  556. Ket
  557. End
  558. ------------------------------------------------------------------
  559. Capture group count = 0
  560. Options: utf
  561. First code unit = 'a'
  562. Last code unit = \x80
  563. Subject length lower bound = 2
  564. /a\x{100}\x{101}+/IB,utf
  565. ------------------------------------------------------------------
  566. Bra
  567. a\x{100}
  568. \x{101}++
  569. Ket
  570. End
  571. ------------------------------------------------------------------
  572. Capture group count = 0
  573. Options: utf
  574. First code unit = 'a'
  575. Last code unit = \x81
  576. Subject length lower bound = 3
  577. /[^\x{c4}]/IB
  578. ------------------------------------------------------------------
  579. Bra
  580. [^\x{c4}]
  581. Ket
  582. End
  583. ------------------------------------------------------------------
  584. Capture group count = 0
  585. Subject length lower bound = 1
  586. /[\x{100}]/IB,utf
  587. ------------------------------------------------------------------
  588. Bra
  589. \x{100}
  590. Ket
  591. End
  592. ------------------------------------------------------------------
  593. Capture group count = 0
  594. Options: utf
  595. First code unit = \xc4
  596. Last code unit = \x80
  597. Subject length lower bound = 1
  598. \x{100}
  599. 0: \x{100}
  600. Z\x{100}
  601. 0: \x{100}
  602. \x{100}Z
  603. 0: \x{100}
  604. /[\xff]/IB,utf
  605. ------------------------------------------------------------------
  606. Bra
  607. \x{ff}
  608. Ket
  609. End
  610. ------------------------------------------------------------------
  611. Capture group count = 0
  612. Options: utf
  613. First code unit = \xc3
  614. Last code unit = \xbf
  615. Subject length lower bound = 1
  616. >\x{ff}<
  617. 0: \x{ff}
  618. /[^\xff]/IB,utf
  619. ------------------------------------------------------------------
  620. Bra
  621. [^\x{ff}]
  622. Ket
  623. End
  624. ------------------------------------------------------------------
  625. Capture group count = 0
  626. Options: utf
  627. Subject length lower bound = 1
  628. /\x{100}abc(xyz(?1))/IB,utf
  629. ------------------------------------------------------------------
  630. Bra
  631. \x{100}abc
  632. CBra 1
  633. xyz
  634. Recurse
  635. Ket
  636. Ket
  637. End
  638. ------------------------------------------------------------------
  639. Capture group count = 1
  640. Options: utf
  641. First code unit = \xc4
  642. Last code unit = 'z'
  643. Subject length lower bound = 7
  644. /\777/I,utf
  645. Capture group count = 0
  646. Options: utf
  647. First code unit = \xc7
  648. Last code unit = \xbf
  649. Subject length lower bound = 1
  650. \x{1ff}
  651. 0: \x{1ff}
  652. \777
  653. 0: \x{1ff}
  654. /\x{100}+\x{200}/IB,utf
  655. ------------------------------------------------------------------
  656. Bra
  657. \x{100}++
  658. \x{200}
  659. Ket
  660. End
  661. ------------------------------------------------------------------
  662. Capture group count = 0
  663. Options: utf
  664. First code unit = \xc4
  665. Last code unit = \x80
  666. Subject length lower bound = 2
  667. /\x{100}+X/IB,utf
  668. ------------------------------------------------------------------
  669. Bra
  670. \x{100}++
  671. X
  672. Ket
  673. End
  674. ------------------------------------------------------------------
  675. Capture group count = 0
  676. Options: utf
  677. First code unit = \xc4
  678. Last code unit = 'X'
  679. Subject length lower bound = 2
  680. /^[\QĀ\E-\QŐ\E/B,utf
  681. Failed: error 106 at offset 15: missing terminating ] for character class
  682. # This tests the stricter UTF-8 check according to RFC 3629.
  683. /X/utf
  684. \= Expect UTF-8 errors
  685. \x{d800}
  686. Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 0
  687. \x{da00}
  688. Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 0
  689. \x{dfff}
  690. Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 0
  691. \x{110000}
  692. Failed: error -15: UTF-8 error: code points greater than 0x10ffff are not defined at offset 0
  693. \x{2000000}
  694. Failed: error -13: UTF-8 error: 5-byte character is not allowed (RFC 3629) at offset 0
  695. \x{7fffffff}
  696. Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 0
  697. \= Expect no match
  698. \x{d800}\=no_utf_check
  699. No match
  700. \x{da00}\=no_utf_check
  701. No match
  702. \x{dfff}\=no_utf_check
  703. No match
  704. \x{110000}\=no_utf_check
  705. No match
  706. \x{2000000}\=no_utf_check
  707. No match
  708. \x{7fffffff}\=no_utf_check
  709. No match
  710. /(*UTF8)\x{1234}/
  711. abcd\x{1234}pqr
  712. 0: \x{1234}
  713. /(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I
  714. Capture group count = 0
  715. Compile options: <none>
  716. Overall options: utf
  717. \R matches any Unicode newline
  718. Forced newline is CRLF
  719. First code unit = 'a'
  720. Last code unit = 'b'
  721. Subject length lower bound = 3
  722. /\h/I,utf
  723. Capture group count = 0
  724. Options: utf
  725. Starting code units: \x09 \x20 \xc2 \xe1 \xe2 \xe3
  726. Subject length lower bound = 1
  727. ABC\x{09}
  728. 0: \x{09}
  729. ABC\x{20}
  730. 0:
  731. ABC\x{a0}
  732. 0: \x{a0}
  733. ABC\x{1680}
  734. 0: \x{1680}
  735. ABC\x{180e}
  736. 0: \x{180e}
  737. ABC\x{2000}
  738. 0: \x{2000}
  739. ABC\x{202f}
  740. 0: \x{202f}
  741. ABC\x{205f}
  742. 0: \x{205f}
  743. ABC\x{3000}
  744. 0: \x{3000}
  745. /\v/I,utf
  746. Capture group count = 0
  747. Options: utf
  748. Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2
  749. Subject length lower bound = 1
  750. ABC\x{0a}
  751. 0: \x{0a}
  752. ABC\x{0b}
  753. 0: \x{0b}
  754. ABC\x{0c}
  755. 0: \x{0c}
  756. ABC\x{0d}
  757. 0: \x{0d}
  758. ABC\x{85}
  759. 0: \x{85}
  760. ABC\x{2028}
  761. 0: \x{2028}
  762. /\h*A/I,utf
  763. Capture group count = 0
  764. Options: utf
  765. Starting code units: \x09 \x20 A \xc2 \xe1 \xe2 \xe3
  766. Last code unit = 'A'
  767. Subject length lower bound = 1
  768. CDBABC
  769. 0: A
  770. /\v+A/I,utf
  771. Capture group count = 0
  772. Options: utf
  773. Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2
  774. Last code unit = 'A'
  775. Subject length lower bound = 2
  776. /\s?xxx\s/I,utf
  777. Capture group count = 0
  778. Options: utf
  779. Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 x
  780. Last code unit = 'x'
  781. Subject length lower bound = 4
  782. /\sxxx\s/I,utf,tables=2
  783. Capture group count = 0
  784. Options: utf
  785. Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \xc2
  786. Last code unit = 'x'
  787. Subject length lower bound = 5
  788. AB\x{85}xxx\x{a0}XYZ
  789. 0: \x{85}xxx\x{a0}
  790. AB\x{a0}xxx\x{85}XYZ
  791. 0: \x{a0}xxx\x{85}
  792. /\S \S/I,utf,tables=2
  793. Capture group count = 0
  794. Options: utf
  795. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f
  796. \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e
  797. \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C
  798. D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h
  799. i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 \xc4
  800. \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3
  801. \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2
  802. \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1
  803. \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
  804. Last code unit = ' '
  805. Subject length lower bound = 3
  806. \x{a2} \x{84}
  807. 0: \x{a2} \x{84}
  808. A Z
  809. 0: A Z
  810. /a+/utf
  811. a\x{123}aa\=offset=1
  812. 0: aa
  813. a\x{123}aa\=offset=3
  814. 0: aa
  815. a\x{123}aa\=offset=4
  816. 0: a
  817. \= Expect bad offset value
  818. a\x{123}aa\=offset=6
  819. Failed: error -33: bad offset value
  820. \= Expect bad UTF-8 offset
  821. a\x{123}aa\=offset=2
  822. Error -36 (bad UTF-8 offset)
  823. \= Expect no match
  824. a\x{123}aa\=offset=5
  825. No match
  826. /\x{1234}+/Ii,utf
  827. Capture group count = 0
  828. Options: caseless utf
  829. Starting code units: \xe1
  830. Subject length lower bound = 1
  831. /\x{1234}+?/Ii,utf
  832. Capture group count = 0
  833. Options: caseless utf
  834. Starting code units: \xe1
  835. Subject length lower bound = 1
  836. /\x{1234}++/Ii,utf
  837. Capture group count = 0
  838. Options: caseless utf
  839. Starting code units: \xe1
  840. Subject length lower bound = 1
  841. /\x{1234}{2}/Ii,utf
  842. Capture group count = 0
  843. Options: caseless utf
  844. Starting code units: \xe1
  845. Subject length lower bound = 2
  846. /[^\x{c4}]/IB,utf
  847. ------------------------------------------------------------------
  848. Bra
  849. [^\x{c4}]
  850. Ket
  851. End
  852. ------------------------------------------------------------------
  853. Capture group count = 0
  854. Options: utf
  855. Subject length lower bound = 1
  856. /X+\x{200}/IB,utf
  857. ------------------------------------------------------------------
  858. Bra
  859. X++
  860. \x{200}
  861. Ket
  862. End
  863. ------------------------------------------------------------------
  864. Capture group count = 0
  865. Options: utf
  866. First code unit = 'X'
  867. Last code unit = \x80
  868. Subject length lower bound = 2
  869. /\R/I,utf
  870. Capture group count = 0
  871. Options: utf
  872. Starting code units: \x0a \x0b \x0c \x0d \xc2 \xe2
  873. Subject length lower bound = 1
  874. /\777/IB,utf
  875. ------------------------------------------------------------------
  876. Bra
  877. \x{1ff}
  878. Ket
  879. End
  880. ------------------------------------------------------------------
  881. Capture group count = 0
  882. Options: utf
  883. First code unit = \xc7
  884. Last code unit = \xbf
  885. Subject length lower bound = 1
  886. /\w+\x{C4}/B,utf
  887. ------------------------------------------------------------------
  888. Bra
  889. \w++
  890. \x{c4}
  891. Ket
  892. End
  893. ------------------------------------------------------------------
  894. a\x{C4}\x{C4}
  895. 0: a\x{c4}
  896. /\w+\x{C4}/B,utf,tables=2
  897. ------------------------------------------------------------------
  898. Bra
  899. \w+
  900. \x{c4}
  901. Ket
  902. End
  903. ------------------------------------------------------------------
  904. a\x{C4}\x{C4}
  905. 0: a\x{c4}\x{c4}
  906. /\W+\x{C4}/B,utf
  907. ------------------------------------------------------------------
  908. Bra
  909. \W+
  910. \x{c4}
  911. Ket
  912. End
  913. ------------------------------------------------------------------
  914. !\x{C4}
  915. 0: !\x{c4}
  916. /\W+\x{C4}/B,utf,tables=2
  917. ------------------------------------------------------------------
  918. Bra
  919. \W++
  920. \x{c4}
  921. Ket
  922. End
  923. ------------------------------------------------------------------
  924. !\x{C4}
  925. 0: !\x{c4}
  926. /\W+\x{A1}/B,utf
  927. ------------------------------------------------------------------
  928. Bra
  929. \W+
  930. \x{a1}
  931. Ket
  932. End
  933. ------------------------------------------------------------------
  934. !\x{A1}
  935. 0: !\x{a1}
  936. /\W+\x{A1}/B,utf,tables=2
  937. ------------------------------------------------------------------
  938. Bra
  939. \W+
  940. \x{a1}
  941. Ket
  942. End
  943. ------------------------------------------------------------------
  944. !\x{A1}
  945. 0: !\x{a1}
  946. /X\s+\x{A0}/B,utf
  947. ------------------------------------------------------------------
  948. Bra
  949. X
  950. \s++
  951. \x{a0}
  952. Ket
  953. End
  954. ------------------------------------------------------------------
  955. X\x20\x{A0}\x{A0}
  956. 0: X \x{a0}
  957. /X\s+\x{A0}/B,utf,tables=2
  958. ------------------------------------------------------------------
  959. Bra
  960. X
  961. \s+
  962. \x{a0}
  963. Ket
  964. End
  965. ------------------------------------------------------------------
  966. X\x20\x{A0}\x{A0}
  967. 0: X \x{a0}\x{a0}
  968. /\S+\x{A0}/B,utf
  969. ------------------------------------------------------------------
  970. Bra
  971. \S+
  972. \x{a0}
  973. Ket
  974. End
  975. ------------------------------------------------------------------
  976. X\x{A0}\x{A0}
  977. 0: X\x{a0}\x{a0}
  978. /\S+\x{A0}/B,utf,tables=2
  979. ------------------------------------------------------------------
  980. Bra
  981. \S++
  982. \x{a0}
  983. Ket
  984. End
  985. ------------------------------------------------------------------
  986. X\x{A0}\x{A0}
  987. 0: X\x{a0}
  988. /\x{a0}+\s!/B,utf
  989. ------------------------------------------------------------------
  990. Bra
  991. \x{a0}++
  992. \s
  993. !
  994. Ket
  995. End
  996. ------------------------------------------------------------------
  997. \x{a0}\x20!
  998. 0: \x{a0} !
  999. /\x{a0}+\s!/B,utf,tables=2
  1000. ------------------------------------------------------------------
  1001. Bra
  1002. \x{a0}+
  1003. \s
  1004. !
  1005. Ket
  1006. End
  1007. ------------------------------------------------------------------
  1008. \x{a0}\x20!
  1009. 0: \x{a0} !
  1010. /A/utf
  1011. \x{ff000041}
  1012. ** Character \x{ff000041} is greater than 0x7fffffff and so cannot be converted to UTF-8
  1013. \x{7f000041}
  1014. Failed: error -14: UTF-8 error: 6-byte character is not allowed (RFC 3629) at offset 0
  1015. /(*UTF8)abc/never_utf
  1016. Failed: error 174 at offset 7: using UTF is disabled by the application
  1017. /abc/utf,never_utf
  1018. Failed: error 174 at offset 0: using UTF is disabled by the application
  1019. /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
  1020. ------------------------------------------------------------------
  1021. Bra
  1022. /i A\x{391}\x{10427}\x{ff3a}\x{1fb0}
  1023. Ket
  1024. End
  1025. ------------------------------------------------------------------
  1026. Capture group count = 0
  1027. Options: caseless utf
  1028. First code unit = 'A' (caseless)
  1029. Subject length lower bound = 5
  1030. /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
  1031. ------------------------------------------------------------------
  1032. Bra
  1033. A\x{391}\x{10427}\x{ff3a}\x{1fb0}
  1034. Ket
  1035. End
  1036. ------------------------------------------------------------------
  1037. Capture group count = 0
  1038. Options: utf
  1039. First code unit = 'A'
  1040. Last code unit = \xb0
  1041. Subject length lower bound = 5
  1042. /AB\x{1fb0}/IB,utf
  1043. ------------------------------------------------------------------
  1044. Bra
  1045. AB\x{1fb0}
  1046. Ket
  1047. End
  1048. ------------------------------------------------------------------
  1049. Capture group count = 0
  1050. Options: utf
  1051. First code unit = 'A'
  1052. Last code unit = \xb0
  1053. Subject length lower bound = 3
  1054. /AB\x{1fb0}/IBi,utf
  1055. ------------------------------------------------------------------
  1056. Bra
  1057. /i AB\x{1fb0}
  1058. Ket
  1059. End
  1060. ------------------------------------------------------------------
  1061. Capture group count = 0
  1062. Options: caseless utf
  1063. First code unit = 'A' (caseless)
  1064. Last code unit = 'B' (caseless)
  1065. Subject length lower bound = 3
  1066. /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
  1067. Capture group count = 0
  1068. Options: caseless utf
  1069. Starting code units: \xd0 \xd1
  1070. Subject length lower bound = 17
  1071. \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
  1072. 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
  1073. \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
  1074. 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
  1075. /[ⱥ]/Bi,utf
  1076. ------------------------------------------------------------------
  1077. Bra
  1078. /i \x{2c65}
  1079. Ket
  1080. End
  1081. ------------------------------------------------------------------
  1082. /[^ⱥ]/Bi,utf
  1083. ------------------------------------------------------------------
  1084. Bra
  1085. /i [^\x{2c65}]
  1086. Ket
  1087. End
  1088. ------------------------------------------------------------------
  1089. /\h/I
  1090. Capture group count = 0
  1091. Starting code units: \x09 \x20 \xa0
  1092. Subject length lower bound = 1
  1093. /\v/I
  1094. Capture group count = 0
  1095. Starting code units: \x0a \x0b \x0c \x0d \x85
  1096. Subject length lower bound = 1
  1097. /\R/I
  1098. Capture group count = 0
  1099. Starting code units: \x0a \x0b \x0c \x0d \x85
  1100. Subject length lower bound = 1
  1101. /[[:blank:]]/B,ucp
  1102. ------------------------------------------------------------------
  1103. Bra
  1104. [\x09 \xa0]
  1105. Ket
  1106. End
  1107. ------------------------------------------------------------------
  1108. /\x{212a}+/Ii,utf
  1109. Capture group count = 0
  1110. Options: caseless utf
  1111. Starting code units: K k \xe2
  1112. Subject length lower bound = 1
  1113. KKkk\x{212a}
  1114. 0: KKkk\x{212a}
  1115. /s+/Ii,utf
  1116. Capture group count = 0
  1117. Options: caseless utf
  1118. Starting code units: S s \xc5
  1119. Subject length lower bound = 1
  1120. SSss\x{17f}
  1121. 0: SSss\x{17f}
  1122. /\x{100}*A/IB,utf
  1123. ------------------------------------------------------------------
  1124. Bra
  1125. \x{100}*+
  1126. A
  1127. Ket
  1128. End
  1129. ------------------------------------------------------------------
  1130. Capture group count = 0
  1131. Options: utf
  1132. Starting code units: A \xc4
  1133. Last code unit = 'A'
  1134. Subject length lower bound = 1
  1135. A
  1136. 0: A
  1137. /\x{100}*\d(?R)/IB,utf
  1138. ------------------------------------------------------------------
  1139. Bra
  1140. \x{100}*+
  1141. \d
  1142. Recurse
  1143. Ket
  1144. End
  1145. ------------------------------------------------------------------
  1146. Capture group count = 0
  1147. Options: utf
  1148. Starting code units: 0 1 2 3 4 5 6 7 8 9 \xc4
  1149. Subject length lower bound = 1
  1150. /[Z\x{100}]/IB,utf
  1151. ------------------------------------------------------------------
  1152. Bra
  1153. [Z\x{100}]
  1154. Ket
  1155. End
  1156. ------------------------------------------------------------------
  1157. Capture group count = 0
  1158. Options: utf
  1159. Starting code units: Z \xc4
  1160. Subject length lower bound = 1
  1161. Z\x{100}
  1162. 0: Z
  1163. \x{100}
  1164. 0: \x{100}
  1165. \x{100}Z
  1166. 0: \x{100}
  1167. /[z-\x{100}]/IB,utf
  1168. ------------------------------------------------------------------
  1169. Bra
  1170. [z-\xff\x{100}]
  1171. Ket
  1172. End
  1173. ------------------------------------------------------------------
  1174. Capture group count = 0
  1175. Options: utf
  1176. Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4
  1177. Subject length lower bound = 1
  1178. /[z\Qa-d]Ā\E]/IB,utf
  1179. ------------------------------------------------------------------
  1180. Bra
  1181. [\-\]adz\x{100}]
  1182. Ket
  1183. End
  1184. ------------------------------------------------------------------
  1185. Capture group count = 0
  1186. Options: utf
  1187. Starting code units: - ] a d z \xc4
  1188. Subject length lower bound = 1
  1189. \x{100}
  1190. 0: \x{100}
  1191. Ā
  1192. 0: \x{100}
  1193. /[ab\x{100}]abc(xyz(?1))/IB,utf
  1194. ------------------------------------------------------------------
  1195. Bra
  1196. [ab\x{100}]
  1197. abc
  1198. CBra 1
  1199. xyz
  1200. Recurse
  1201. Ket
  1202. Ket
  1203. End
  1204. ------------------------------------------------------------------
  1205. Capture group count = 1
  1206. Options: utf
  1207. Starting code units: a b \xc4
  1208. Last code unit = 'z'
  1209. Subject length lower bound = 7
  1210. /\x{100}*\s/IB,utf
  1211. ------------------------------------------------------------------
  1212. Bra
  1213. \x{100}*+
  1214. \s
  1215. Ket
  1216. End
  1217. ------------------------------------------------------------------
  1218. Capture group count = 0
  1219. Options: utf
  1220. Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 \xc4
  1221. Subject length lower bound = 1
  1222. /\x{100}*\d/IB,utf
  1223. ------------------------------------------------------------------
  1224. Bra
  1225. \x{100}*+
  1226. \d
  1227. Ket
  1228. End
  1229. ------------------------------------------------------------------
  1230. Capture group count = 0
  1231. Options: utf
  1232. Starting code units: 0 1 2 3 4 5 6 7 8 9 \xc4
  1233. Subject length lower bound = 1
  1234. /\x{100}*\w/IB,utf
  1235. ------------------------------------------------------------------
  1236. Bra
  1237. \x{100}*+
  1238. \w
  1239. Ket
  1240. End
  1241. ------------------------------------------------------------------
  1242. Capture group count = 0
  1243. Options: utf
  1244. Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
  1245. Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z
  1246. \xc4
  1247. Subject length lower bound = 1
  1248. /\x{100}*\D/IB,utf
  1249. ------------------------------------------------------------------
  1250. Bra
  1251. \x{100}*
  1252. \D
  1253. Ket
  1254. End
  1255. ------------------------------------------------------------------
  1256. Capture group count = 0
  1257. Options: utf
  1258. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  1259. \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
  1260. \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = >
  1261. ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c
  1262. d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2
  1263. \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1
  1264. \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0
  1265. \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef
  1266. \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe
  1267. \xff
  1268. Subject length lower bound = 1
  1269. /\x{100}*\S/IB,utf
  1270. ------------------------------------------------------------------
  1271. Bra
  1272. \x{100}*
  1273. \S
  1274. Ket
  1275. End
  1276. ------------------------------------------------------------------
  1277. Capture group count = 0
  1278. Options: utf
  1279. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0e \x0f
  1280. \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d \x1e
  1281. \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C
  1282. D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h
  1283. i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 \xc4
  1284. \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3
  1285. \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2
  1286. \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1
  1287. \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
  1288. Subject length lower bound = 1
  1289. /\x{100}*\W/IB,utf
  1290. ------------------------------------------------------------------
  1291. Bra
  1292. \x{100}*
  1293. \W
  1294. Ket
  1295. End
  1296. ------------------------------------------------------------------
  1297. Capture group count = 0
  1298. Options: utf
  1299. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  1300. \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
  1301. \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = >
  1302. ? @ [ \ ] ^ ` { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9
  1303. \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8
  1304. \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7
  1305. \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6
  1306. \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
  1307. Subject length lower bound = 1
  1308. /[\x{105}-\x{109}]/IBi,utf
  1309. ------------------------------------------------------------------
  1310. Bra
  1311. [\x{104}-\x{109}]
  1312. Ket
  1313. End
  1314. ------------------------------------------------------------------
  1315. Capture group count = 0
  1316. Options: caseless utf
  1317. Starting code units: \xc4
  1318. Subject length lower bound = 1
  1319. \x{104}
  1320. 0: \x{104}
  1321. \x{105}
  1322. 0: \x{105}
  1323. \x{109}
  1324. 0: \x{109}
  1325. \= Expect no match
  1326. \x{100}
  1327. No match
  1328. \x{10a}
  1329. No match
  1330. /[z-\x{100}]/IBi,utf
  1331. ------------------------------------------------------------------
  1332. Bra
  1333. [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}]
  1334. Ket
  1335. End
  1336. ------------------------------------------------------------------
  1337. Capture group count = 0
  1338. Options: caseless utf
  1339. Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
  1340. Subject length lower bound = 1
  1341. Z
  1342. 0: Z
  1343. z
  1344. 0: z
  1345. \x{39c}
  1346. 0: \x{39c}
  1347. \x{178}
  1348. 0: \x{178}
  1349. |
  1350. 0: |
  1351. \x{80}
  1352. 0: \x{80}
  1353. \x{ff}
  1354. 0: \x{ff}
  1355. \x{100}
  1356. 0: \x{100}
  1357. \x{101}
  1358. 0: \x{101}
  1359. \= Expect no match
  1360. \x{102}
  1361. No match
  1362. Y
  1363. No match
  1364. y
  1365. No match
  1366. /[z-\x{100}]/IBi,utf
  1367. ------------------------------------------------------------------
  1368. Bra
  1369. [Zz-\xff\x{39c}\x{3bc}\x{212b}\x{1e9e}\x{212b}\x{178}\x{100}-\x{101}]
  1370. Ket
  1371. End
  1372. ------------------------------------------------------------------
  1373. Capture group count = 0
  1374. Options: caseless utf
  1375. Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
  1376. Subject length lower bound = 1
  1377. /\x{3a3}B/IBi,utf
  1378. ------------------------------------------------------------------
  1379. Bra
  1380. clist 03a3 03c2 03c3
  1381. /i B
  1382. Ket
  1383. End
  1384. ------------------------------------------------------------------
  1385. Capture group count = 0
  1386. Options: caseless utf
  1387. Starting code units: \xce \xcf
  1388. Last code unit = 'B' (caseless)
  1389. Subject length lower bound = 2
  1390. /abc/utf,replace=�
  1391. abc
  1392. Failed: error -3: UTF-8 error: 1 byte missing at end
  1393. /(?<=(a)(?-1))x/I,utf
  1394. Capture group count = 1
  1395. Max lookbehind = 2
  1396. Options: utf
  1397. First code unit = 'x'
  1398. Subject length lower bound = 1
  1399. a\x80zx\=offset=3
  1400. Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1
  1401. /[\W\p{Any}]/B
  1402. ------------------------------------------------------------------
  1403. Bra
  1404. [\x00-/:-@[-^`{-\xff\p{Any}]
  1405. Ket
  1406. End
  1407. ------------------------------------------------------------------
  1408. abc
  1409. 0: a
  1410. 123
  1411. 0: 1
  1412. /[\W\pL]/B
  1413. ------------------------------------------------------------------
  1414. Bra
  1415. [\x00-/:-@[-^`{-\xff\p{L}]
  1416. Ket
  1417. End
  1418. ------------------------------------------------------------------
  1419. abc
  1420. 0: a
  1421. \= Expect no match
  1422. 123
  1423. No match
  1424. /(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf
  1425. Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)
  1426. /[\s[:^ascii:]]/B,ucp
  1427. ------------------------------------------------------------------
  1428. Bra
  1429. [\x80-\xff\p{Xsp}]
  1430. Ket
  1431. End
  1432. ------------------------------------------------------------------
  1433. # A special extra option allows escaped surrogate code points in 8-bit mode,
  1434. # but subjects containing them must not be UTF-checked.
  1435. /\x{d800}/I,utf,allow_surrogate_escapes
  1436. Capture group count = 0
  1437. Options: utf
  1438. Extra options: allow_surrogate_escapes
  1439. First code unit = \xed
  1440. Last code unit = \x80
  1441. Subject length lower bound = 1
  1442. \x{d800}\=no_utf_check
  1443. 0: \x{d800}
  1444. /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
  1445. \x{dfff}\x{df01}\=no_utf_check
  1446. 0: \x{dfff}\x{df01}
  1447. # This has different starting code units in 8-bit mode.
  1448. /^[^ab]/IB,utf
  1449. ------------------------------------------------------------------
  1450. Bra
  1451. ^
  1452. [\x00-`c-\xff] (neg)
  1453. Ket
  1454. End
  1455. ------------------------------------------------------------------
  1456. Capture group count = 0
  1457. Compile options: utf
  1458. Overall options: anchored utf
  1459. Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
  1460. \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
  1461. \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
  1462. 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
  1463. Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
  1464. \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0
  1465. \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf
  1466. \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee
  1467. \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd
  1468. \xfe \xff
  1469. Subject length lower bound = 1
  1470. c
  1471. 0: c
  1472. \x{ff}
  1473. 0: \x{ff}
  1474. \x{100}
  1475. 0: \x{100}
  1476. \= Expect no match
  1477. aaa
  1478. No match
  1479. # Offsets are different in 8-bit mode.
  1480. /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
  1481. 123abcáyzabcdef789abcሴqr
  1482. 1(2) Old 6 6 "" New 6 8 "<>"
  1483. 2(2) Old 13 13 "" New 15 17 "<>"
  1484. 3(2) Old 13 16 "def" New 17 22 "<def>"
  1485. 4(2) Old 22 22 "" New 28 30 "<>"
  1486. 4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
  1487. # Check name length with non-ASCII characters
  1488. /(?'ABáC678901234567890123456789012012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
  1489. /(?'ABáC6789012345678901234567890123012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
  1490. Failed: error 148 at offset 132: subpattern name is too long (maximum 128 code units)
  1491. /(?'ABZC6789012345678901234567890123012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
  1492. /(?(n/utf
  1493. Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?)
  1494. /(?(á/utf
  1495. Failed: error 142 at offset 5: syntax error in subpattern name (missing terminator?)
  1496. # Invalid UTF-8 tests
  1497. /.../g,match_invalid_utf
  1498. abcd\x80wxzy\x80pqrs
  1499. 0: abc
  1500. 0: wxz
  1501. 0: pqr
  1502. abcd\x{80}wxzy\x80pqrs
  1503. 0: abc
  1504. 0: d\x{80}w
  1505. 0: xzy
  1506. 0: pqr
  1507. /abc/match_invalid_utf
  1508. ab\x80ab\=ph
  1509. Partial match: ab
  1510. \= Expect no match
  1511. ab\x80cdef\=ph
  1512. No match
  1513. /.a/match_invalid_utf
  1514. ab\=ph
  1515. Partial match: b
  1516. ab\=ps
  1517. Partial match: b
  1518. b\xf0\x91\x88b\=ph
  1519. Partial match: b
  1520. b\xf0\x91\x88b\=ps
  1521. Partial match: b
  1522. b\xf0\x91\x88\xb4a
  1523. 0: \x{11234}a
  1524. \= Expect no match
  1525. b\x80\=ph
  1526. No match
  1527. b\x80\=ps
  1528. No match
  1529. b\xf0\x91\x88\=ph
  1530. No match
  1531. b\xf0\x91\x88\=ps
  1532. No match
  1533. /.a$/match_invalid_utf
  1534. ab\=ph
  1535. Partial match: b
  1536. ab\=ps
  1537. Partial match: b
  1538. \= Expect no match
  1539. b\xf0\x91\x98\=ph
  1540. No match
  1541. b\xf0\x91\x98\=ps
  1542. No match
  1543. /ab$/match_invalid_utf
  1544. ab\x80cdeab
  1545. 0: ab
  1546. \= Expect no match
  1547. ab\x80cde
  1548. No match
  1549. /.../g,match_invalid_utf
  1550. abcd\x{80}wxzy\x80pqrs
  1551. 0: abc
  1552. 0: d\x{80}w
  1553. 0: xzy
  1554. 0: pqr
  1555. /(?<=x)../g,match_invalid_utf
  1556. abcd\x{80}wxzy\x80pqrs
  1557. 0: zy
  1558. abcd\x{80}wxzy\x80xpqrs
  1559. 0: zy
  1560. 0: pq
  1561. /X$/match_invalid_utf
  1562. \= Expect no match
  1563. X\xc4
  1564. No match
  1565. /(?<=..)X/match_invalid_utf,aftertext
  1566. AB\x80AQXYZ
  1567. 0: X
  1568. 0+ YZ
  1569. AB\x80AQXYZ\=offset=5
  1570. 0: X
  1571. 0+ YZ
  1572. AB\x80\x80AXYZXC\=offset=5
  1573. 0: X
  1574. 0+ C
  1575. \= Expect no match
  1576. AB\x80XYZ
  1577. No match
  1578. AB\x80XYZ\=offset=3
  1579. No match
  1580. AB\xfeXYZ
  1581. No match
  1582. AB\xffXYZ\=offset=3
  1583. No match
  1584. AB\x80AXYZ
  1585. No match
  1586. AB\x80AXYZ\=offset=4
  1587. No match
  1588. AB\x80\x80AXYZ\=offset=5
  1589. No match
  1590. /.../match_invalid_utf
  1591. AB\xc4CCC
  1592. 0: CCC
  1593. \= Expect no match
  1594. A\x{d800}B
  1595. No match
  1596. A\x{110000}B
  1597. No match
  1598. A\xc4B
  1599. No match
  1600. /\bX/match_invalid_utf
  1601. A\x80X
  1602. 0: X
  1603. /\BX/match_invalid_utf
  1604. \= Expect no match
  1605. A\x80X
  1606. No match
  1607. /(?<=...)X/match_invalid_utf
  1608. AAA\x80BBBXYZ
  1609. 0: X
  1610. \= Expect no match
  1611. AAA\x80BXYZ
  1612. No match
  1613. AAA\x80BBXYZ
  1614. No match
  1615. # -------------------------------------
  1616. /(*UTF)(?=\x{123})/I
  1617. Capture group count = 0
  1618. May match empty string
  1619. Compile options: <none>
  1620. Overall options: utf
  1621. First code unit = \xc4
  1622. Last code unit = \xa3
  1623. Subject length lower bound = 1
  1624. /[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
  1625. Capture group count = 0
  1626. Options: utf
  1627. Starting code units: \xc3
  1628. Last code unit = 'X'
  1629. Subject length lower bound = 3
  1630. /[󿾟,]/BI,utf
  1631. ------------------------------------------------------------------
  1632. Bra
  1633. [,\x{fff9f}]
  1634. Ket
  1635. End
  1636. ------------------------------------------------------------------
  1637. Capture group count = 0
  1638. Options: utf
  1639. Starting code units: , \xf3
  1640. Subject length lower bound = 1
  1641. /[\x{fff4}-\x{ffff8}]/I,utf
  1642. Capture group count = 0
  1643. Options: utf
  1644. Starting code units: \xef \xf0 \xf1 \xf2 \xf3
  1645. Subject length lower bound = 1
  1646. /[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
  1647. Capture group count = 0
  1648. Options: utf
  1649. Starting code units: \xef \xf0 \xf1 \xf2 \xf4
  1650. Subject length lower bound = 1
  1651. /[\xff\x{ffff}]/I,utf
  1652. Capture group count = 0
  1653. Options: utf
  1654. Starting code units: \xc3 \xef
  1655. Subject length lower bound = 1
  1656. /[\xff\x{ff}]/I,utf
  1657. Capture group count = 0
  1658. Options: utf
  1659. Starting code units: \xc3
  1660. Subject length lower bound = 1
  1661. abc\x{ff}def
  1662. 0: \x{ff}
  1663. /[\xff\x{ff}]/I
  1664. Capture group count = 0
  1665. First code unit = \xff
  1666. Subject length lower bound = 1
  1667. abc\x{ff}def
  1668. 0: \xff
  1669. /[Ss]/I
  1670. Capture group count = 0
  1671. First code unit = 'S' (caseless)
  1672. Subject length lower bound = 1
  1673. /[Ss]/I,utf
  1674. Capture group count = 0
  1675. Options: utf
  1676. Starting code units: S s
  1677. Subject length lower bound = 1
  1678. /(?:\x{ff}|\x{3000})/I,utf
  1679. Capture group count = 0
  1680. Options: utf
  1681. Starting code units: \xc3 \xe3
  1682. Subject length lower bound = 1
  1683. /x/utf
  1684. abxyz
  1685. 0: x
  1686. \x80\=startchar
  1687. Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 0
  1688. abc\x80\=startchar
  1689. Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 3
  1690. abc\x80\=startchar,offset=3
  1691. Error -36 (bad UTF-8 offset)
  1692. /\x{c1}+\x{e1}/iIB,ucp
  1693. ------------------------------------------------------------------
  1694. Bra
  1695. /i \x{c1}+
  1696. /i \x{e1}
  1697. Ket
  1698. End
  1699. ------------------------------------------------------------------
  1700. Capture group count = 0
  1701. Options: caseless ucp
  1702. First code unit = \xc1 (caseless)
  1703. Last code unit = \xe1 (caseless)
  1704. Subject length lower bound = 2
  1705. \x{c1}\x{c1}\x{c1}
  1706. 0: \xc1\xc1\xc1
  1707. \x{e1}\x{e1}\x{e1}
  1708. 0: \xe1\xe1\xe1
  1709. /a|\x{c1}/iI,ucp
  1710. Capture group count = 0
  1711. Options: caseless ucp
  1712. Starting code units: A a \xc1 \xe1
  1713. Subject length lower bound = 1
  1714. \x{e1}xxx
  1715. 0: \xe1
  1716. /a|\x{c1}/iI,utf
  1717. Capture group count = 0
  1718. Options: caseless utf
  1719. Starting code units: A a \xc3
  1720. Subject length lower bound = 1
  1721. \x{e1}xxx
  1722. 0: \x{e1}
  1723. /\x{c1}|\x{e1}/iI,ucp
  1724. Capture group count = 0
  1725. Options: caseless ucp
  1726. First code unit = \xc1 (caseless)
  1727. Subject length lower bound = 1
  1728. /X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
  1729. X\x{e1}Y
  1730. 1: >\xc1<
  1731. /X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
  1732. X\x{c1}Y
  1733. 1: >\xe1<
  1734. # Without UTF or UCP, characters > 127 have only one case in the default locale.
  1735. /X(\x{e1})Y/replace=>\U$1<,substitute_extended
  1736. X\x{e1}Y
  1737. 1: >\xe1<
  1738. /A/utf,match_invalid_utf,caseless
  1739. \xe5A
  1740. 0: A
  1741. /\bch\b/utf,match_invalid_utf
  1742. qchq\=ph
  1743. Partial match:
  1744. qchq\=ps
  1745. Partial match:
  1746. /line1\nbreak/firstline,utf,match_invalid_utf
  1747. line1\nbreak
  1748. 0: line1\x{0a}break
  1749. line0\nline1\nbreak
  1750. No match
  1751. /A\z/utf,match_invalid_utf
  1752. A\x80\x42\n
  1753. No match
  1754. # End of testinput10