Personal emacs config
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3158 lines
115 KiB

  1. ;;; pcre2el.el --- regexp syntax converter -*- lexical-binding: t -*-
  2. ;; Copyright (C) 2012-2015 Jon Oddie <jonxfield@gmail.com>
  3. ;; Author: joddie <jonxfield at gmail.com>
  4. ;; Hacked additionally by: opensource at hardakers dot net
  5. ;; Created: 14 Feb 2012
  6. ;; Updated: 13 December 2015
  7. ;; Version: 1.8
  8. ;; Package-Version: 1.8
  9. ;; Package-Commit: 166a10472002010692dbc35f323ffb8110a294c5
  10. ;; Url: https://github.com/joddie/pcre2el
  11. ;; Package-Requires: ((emacs "24") (cl-lib "0.3"))
  12. ;; This file is NOT part of GNU Emacs.
  13. ;; This program is free software: you can redistribute it and/or
  14. ;; modify it under the terms of the GNU General Public License as
  15. ;; published by the Free Software Foundation, either version 3 of the
  16. ;; License, or (at your option) any later version.
  17. ;;
  18. ;; This program is distributed in the hope that it will be useful, but
  19. ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  20. ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  21. ;; General Public License for more details.
  22. ;;
  23. ;; You should have received a copy of the GNU General Public License
  24. ;; along with this program. If not, see `http://www.gnu.org/licenses/'.
  25. ;; This file incorporates work covered by the following copyright and
  26. ;; permission notice:
  27. ;;
  28. ;; Copyright (c) 1993-2002 Richard Kelsey and Jonathan Rees
  29. ;; Copyright (c) 1994-2002 by Olin Shivers and Brian D. Carlstrom.
  30. ;; Copyright (c) 1999-2002 by Martin Gasbichler.
  31. ;; Copyright (c) 2001-2002 by Michael Sperber.
  32. ;; All rights reserved.
  33. ;;
  34. ;; Redistribution and use in source and binary forms, with or without
  35. ;; modification, are permitted provided that the following conditions
  36. ;; are met: 1. Redistributions of source code must retain the above
  37. ;; copyright notice, this list of conditions and the following
  38. ;; disclaimer. 2. Redistributions in binary form must reproduce the
  39. ;; above copyright notice, this list of conditions and the following
  40. ;; disclaimer in the documentation and/or other materials provided
  41. ;; with the distribution. 3. The name of the authors may not be used
  42. ;; to endorse or promote products derived from this software without
  43. ;; specific prior written permission.
  44. ;;
  45. ;; THIS SOFTWARE IS PROVIDED BY THE AUTHORS "AS IS" AND ANY EXPRESS OR
  46. ;; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  47. ;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  48. ;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
  49. ;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  50. ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  51. ;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  52. ;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  53. ;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  54. ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  55. ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  56. ;;; Commentary:
  57. ;; 1 Overview
  58. ;; ==========
  59. ;; `pcre2el' or `rxt' (RegeXp Translator or RegeXp Tools) is a utility
  60. ;; for working with regular expressions in Emacs, based on a
  61. ;; recursive-descent parser for regexp syntax. In addition to converting
  62. ;; (a subset of) PCRE syntax into its Emacs equivalent, it can do the
  63. ;; following:
  64. ;; - convert Emacs syntax to PCRE
  65. ;; - convert either syntax to `rx', an S-expression based regexp syntax
  66. ;; - untangle complex regexps by showing the parse tree in `rx' form and
  67. ;; highlighting the corresponding chunks of code
  68. ;; - show the complete list of strings (productions) matching a regexp,
  69. ;; provided the list is finite
  70. ;; - provide live font-locking of regexp syntax (so far only for Elisp
  71. ;; buffers -- other modes on the TODO list)
  72. ;; 2 Usage
  73. ;; =======
  74. ;; Enable `rxt-mode' or its global equivalent `rxt-global-mode' to get
  75. ;; the default key-bindings. There are three sets of commands: commands
  76. ;; that take a PCRE regexp, commands which take an Emacs regexp, and
  77. ;; commands that try to do the right thing based on the current
  78. ;; mode. Currently, this means Emacs syntax in `emacs-lisp-mode' and
  79. ;; `lisp-interaction-mode', and PCRE syntax everywhere else.
  80. ;; The default key bindings all begin with `C-c /' and have a mnemonic
  81. ;; structure: `C-c / <source> <target>', or just `C-c / <target>' for the
  82. ;; "do what I mean" commands. The complete list of key bindings is given
  83. ;; here and explained in more detail below:
  84. ;; - "Do-what-I-mean" commands:
  85. ;; `C-c / /': `rxt-explain'
  86. ;; `C-c / c': `rxt-convert-syntax'
  87. ;; `C-c / x': `rxt-convert-to-rx'
  88. ;; `C-c / '': `rxt-convert-to-strings'
  89. ;; - Commands that work on a PCRE regexp:
  90. ;; `C-c / p e': `rxt-pcre-to-elisp'
  91. ;; `C-c / %': `pcre-query-replace-regexp'
  92. ;; `C-c / p x': `rxt-pcre-to-rx'
  93. ;; `C-c / p '': `rxt-pcre-to-strings'
  94. ;; `C-c / p /': `rxt-explain-pcre'
  95. ;; - Commands that work on an Emacs regexp:
  96. ;; `C-c / e /': `rxt-explain-elisp'
  97. ;; `C-c / e p': `rxt-elisp-to-pcre'
  98. ;; `C-c / e x': `rxt-elisp-to-rx'
  99. ;; `C-c / e '': `rxt-elisp-to-strings'
  100. ;; `C-c / e t': `rxt-toggle-elisp-rx'
  101. ;; `C-c / t': `rxt-toggle-elisp-rx'
  102. ;; 2.1 Interactive input and output
  103. ;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  104. ;; When used interactively, the conversion commands can read a regexp
  105. ;; either from the current buffer or from the minibuffer. The output is
  106. ;; displayed in the minibuffer and copied to the kill-ring.
  107. ;; - When called with a prefix argument (`C-u'), they read a regular
  108. ;; expression from the minibuffer literally, without further processing
  109. ;; -- meaning there's no need to double the backslashes if it's an
  110. ;; Emacs regexp. This is the same way commands like
  111. ;; `query-replace-regexp' read input.
  112. ;; - When the region is active, they use they the region contents, again
  113. ;; literally (without any translation of string syntax).
  114. ;; - With neither a prefix arg nor an active region, the behavior depends
  115. ;; on whether the command expects an Emacs regexp or a PCRE one.
  116. ;; Commands that take an Emacs regexp behave like `C-x C-e': they
  117. ;; evaluate the sexp before point (which could be simply a string
  118. ;; literal) and use its value. This is designed for use in Elisp
  119. ;; buffers. As a special case, if point is *inside* a string, it's
  120. ;; first moved to the string end, so in practice they should work as
  121. ;; long as point is somewhere within the regexp literal.
  122. ;; Commands that take a PCRE regexp try to read a Perl-style delimited
  123. ;; regex literal *after* point in the current buffer, including its
  124. ;; flags. For example, putting point before the `m' in the following
  125. ;; example and doing `C-c / p e' (`rxt-pcre-to-elisp') displays
  126. ;; `\(?:bar\|foo\)', correctly stripping out the whitespace and
  127. ;; comment:
  128. ;; ,----
  129. ;; | $x =~ m/ foo | (?# comment) bar /x
  130. ;; `----
  131. ;; The PCRE reader currently only works with `/ ... /' delimiters. It
  132. ;; will ignore any preceding `m', `s', or `qr' operator, as well as the
  133. ;; replacement part of an `s' construction.
  134. ;; Readers for other PCRE-using languages are on the TODO list.
  135. ;; The translation functions display their result in the minibuffer and
  136. ;; copy it to the kill ring. When translating something into Elisp
  137. ;; syntax, you might need to use the result either literally (e.g. for
  138. ;; interactive input to a command like `query-replace-regexp'), or as a
  139. ;; string to paste into Lisp code. To allow both uses,
  140. ;; `rxt-pcre-to-elisp' copies both versions successively to the
  141. ;; kill-ring. The literal regexp without string quoting is the top
  142. ;; element of the kill-ring, while the Lisp string is the
  143. ;; second-from-top. You can paste the literal regexp somewhere by doing
  144. ;; `C-y', or the Lisp string by `C-y M-y'.
  145. ;; 2.2 Syntax conversion commands
  146. ;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  147. ;; `rxt-convert-syntax' (`C-c / c') converts between Emacs and PCRE
  148. ;; syntax, depending on the major mode in effect when called.
  149. ;; Alternatively, you can specify the conversion direction explicitly by
  150. ;; using either `rxt-pcre-to-elisp' (`C-c / p e') or `rxt-elisp-to-pcre'
  151. ;; (`C-c / e p').
  152. ;; Similarly, `rxt-convert-to-rx' (`C-c / x') converts either kind of
  153. ;; syntax to `rx' form, while `rxt-convert-pcre-to-rx' (`C-c / p x') and
  154. ;; `rxt-convert-elisp-to-rx' (`C-c / e x') convert to `rx' from a
  155. ;; specified source type.
  156. ;; In Elisp buffers, you can use `rxt-toggle-elisp-rx' (`C-c / t' or `C-c
  157. ;; / e t') to switch the regexp at point back and forth between string
  158. ;; and `rx' syntax. Point should either be within an `rx' or
  159. ;; `rx-to-string' form or a string literal for this to work.
  160. ;; 2.3 PCRE mode (experimental)
  161. ;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  162. ;; If you want to use emulated PCRE regexp syntax in all Emacs commands,
  163. ;; try `pcre-mode', which uses Emacs's advice system to make all commands
  164. ;; that read regexps using the minibuffer use emulated PCRE syntax. It
  165. ;; should also work with Isearch.
  166. ;; This feature is still fairly experimental. It may fail to work or do
  167. ;; the wrong thing with certain commands. Please report bugs.
  168. ;; `pcre-query-replace-regexp' was originally defined to do query-replace
  169. ;; using emulated PCRE regexps, and is now made somewhat obsolete by
  170. ;; `pcre-mode'. It is bound to `C-c / %' by default, by analogy with
  171. ;; `M-%'. Put the following in your `.emacs' if you want to use
  172. ;; PCRE-style query replacement everywhere:
  173. ;; ,----
  174. ;; | (global-set-key [(meta %)] 'pcre-query-replace-regexp)
  175. ;; `----
  176. ;; 2.5 Explain regexps
  177. ;; ~~~~~~~~~~~~~~~~~~~
  178. ;; When syntax-highlighting isn't enough to untangle some gnarly regexp
  179. ;; you find in the wild, try the 'explain' commands: `rxt-explain' (`C-c
  180. ;; / /'), `rxt-explain-pcre' (`C-c / p') and `rxt-explain-elisp' (`C-c /
  181. ;; e'). These display the original regexp along with its pretty-printed
  182. ;; `rx' equivalent in a new buffer. Moving point around either in the
  183. ;; original regexp or the `rx' translation highlights corresponding
  184. ;; pieces of syntax, which can aid in seeing things like the scope of
  185. ;; quantifiers.
  186. ;; I call them "explain" commands because the `rx' form is close to a
  187. ;; plain syntax tree, and this plus the wordiness of the operators
  188. ;; usually helps to clarify what is going on. People who dislike Lisp
  189. ;; syntax might disagree with this assessment.
  190. ;; 2.6 Generate all matching strings (productions)
  191. ;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  192. ;; Occasionally you come across a regexp which is designed to match a
  193. ;; finite set of strings, e.g. a set of keywords, and it would be useful
  194. ;; to recover the original set. (In Emacs you can generate such regexps
  195. ;; using `regexp-opt'). The commands `rxt-convert-to-strings' (`C-c /
  196. ;; ′'), `rxt-pcre-to-strings' (`C-c / p ′') or `rxt-elisp-to-strings'
  197. ;; (`C-c / e ′') accomplish this by generating all the matching strings
  198. ;; ("productions") of a regexp. (The productions are copied to the kill
  199. ;; ring as a Lisp list).
  200. ;; An example in Lisp code:
  201. ;; ,----
  202. ;; | (regexp-opt '("cat" "caterpillar" "catatonic"))
  203. ;; | ;; => "\\(?:cat\\(?:atonic\\|erpillar\\)?\\)"
  204. ;; | (rxt-elisp-to-strings "\\(?:cat\\(?:atonic\\|erpillar\\)?\\)")
  205. ;; | ;; => '("cat" "caterpillar" "catatonic")
  206. ;; `----
  207. ;; For obvious reasons, these commands only work with regexps that don't
  208. ;; include any unbounded quantifiers like `+' or `*'. They also can't
  209. ;; enumerate all the characters that match a named character class like
  210. ;; `[[:alnum:]]'. In either case they will give a (hopefully meaningful)
  211. ;; error message. Due to the nature of permutations, it's still possible
  212. ;; for a finite regexp to generate a huge number of productions, which
  213. ;; will eat memory and slow down your Emacs. Be ready with `C-g' if
  214. ;; necessary.
  215. ;; 2.7 RE-Builder support
  216. ;; ~~~~~~~~~~~~~~~~~~~~~~
  217. ;; The Emacs RE-Builder is a useful visual tool which allows using
  218. ;; several different built-in syntaxes via `reb-change-syntax' (`C-c
  219. ;; TAB'). It supports Elisp read and literal syntax and `rx', but it can
  220. ;; only convert from the symbolic forms to Elisp, not the other way. This
  221. ;; package hacks the RE-Builder to also work with emulated PCRE syntax,
  222. ;; and to convert transparently between Elisp, PCRE and rx syntaxes. PCRE
  223. ;; mode reads a delimited Perl-like literal of the form `/ ... /', and it
  224. ;; should correctly support using the `x' and `s' flags.
  225. ;; 2.8 Use from Lisp
  226. ;; ~~~~~~~~~~~~~~~~~
  227. ;; Example of using the conversion functions:
  228. ;; ,----
  229. ;; | (rxt-pcre-to-elisp "(abc|def)\\w+\\d+")
  230. ;; | ;; => "\\(\\(?:abc\\|def\\)\\)[_[:alnum:]]+[[:digit:]]+"
  231. ;; `----
  232. ;; All the conversion functions take a single string argument, the regexp
  233. ;; to translate:
  234. ;; - `rxt-pcre-to-elisp'
  235. ;; - `rxt-pcre-to-rx'
  236. ;; - `rxt-pcre-to-strings'
  237. ;; - `rxt-elisp-to-pcre'
  238. ;; - `rxt-elisp-to-rx'
  239. ;; - `rxt-elisp-to-strings'
  240. ;; 3 Bugs and Limitations
  241. ;; ======================
  242. ;; 3.1 Limitations on PCRE syntax
  243. ;; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  244. ;; PCRE has a complicated syntax and semantics, only some of which can be
  245. ;; translated into Elisp. The following subset of PCRE should be
  246. ;; correctly parsed and converted:
  247. ;; - parenthesis grouping `( .. )', including shy matches `(?: ... )'
  248. ;; - backreferences (various syntaxes), but only up to 9 per expression
  249. ;; - alternation `|'
  250. ;; - greedy and non-greedy quantifiers `*', `*?', `+', `+?', `?' and `??'
  251. ;; (all of which are the same in Elisp as in PCRE)
  252. ;; - numerical quantifiers `{M,N}'
  253. ;; - beginning/end of string `\A', `\Z'
  254. ;; - string quoting `\Q .. \E'
  255. ;; - word boundaries `\b', `\B' (these are the same in Elisp)
  256. ;; - single character escapes `\a', `\c', `\e', `\f', `\n', `\r', `\t',
  257. ;; `\x', and `\octal digits' (but see below about non-ASCII characters)
  258. ;; - character classes `[...]' including Posix escapes
  259. ;; - character classes `\d', `\D', `\h', `\H', `\s', `\S', `\v', `\V'
  260. ;; both within character class brackets and outside
  261. ;; - word and non-word characters `\w' and `\W' (Emacs has the same
  262. ;; syntax, but its meaning is different)
  263. ;; - `s' (single line) and `x' (extended syntax) flags, in regexp
  264. ;; literals, or set within the expression via `(?xs-xs)' or `(?xs-xs:
  265. ;; .... )' syntax
  266. ;; - comments `(?# ... )'
  267. ;; Most of the more esoteric PCRE features can't really be supported by
  268. ;; simple translation to Elisp regexps. These include the different
  269. ;; lookaround assertions, conditionals, and the "backtracking control
  270. ;; verbs" `(* ...)' . OTOH, there are a few other syntaxes which are
  271. ;; currently unsupported and possibly could be:
  272. ;; - `\L', `\U', `\l', `\u' case modifiers
  273. ;; - `\g{...}' backreferences
  274. ;; 3.2 Other limitations
  275. ;; ~~~~~~~~~~~~~~~~~~~~~
  276. ;; - The order of alternatives and characters in char classes sometimes
  277. ;; gets shifted around, which is annoying.
  278. ;; - Although the string parser tries to interpret PCRE's octal and
  279. ;; hexadecimal escapes correctly, there are problems with matching
  280. ;; 8-bit characters that I don't use enough to properly understand,
  281. ;; e.g.:
  282. ;; ,----
  283. ;; | (string-match-p (rxt-pcre-to-elisp "\\377") "\377") => nil
  284. ;; `----
  285. ;; A fix for this would be welcome.
  286. ;; - Most of PCRE's rules for how `^', `\A', `$' and `\Z' interact with
  287. ;; newlines are not implemented, since they seem less relevant to
  288. ;; Emacs's buffer-oriented rather than line-oriented model. However,
  289. ;; the different meanings of the `.' metacharacter *are* implemented
  290. ;; (it matches newlines with the `/s' flag, but not otherwise).
  291. ;; - Not currently namespace clean (both `rxt-' and a couple of `pcre-'
  292. ;; functions).
  293. ;; 3.3 TODO:
  294. ;; ~~~~~~~~~
  295. ;; - Python-specific extensions to PCRE?
  296. ;; - Language-specific stuff to enable regexp font-locking and explaining
  297. ;; in different modes. Each language would need two functions, which
  298. ;; could be kept in an alist:
  299. ;; 1. A function to read PCRE regexps, taking the string syntax into
  300. ;; account. E.g., Python has single-quoted, double-quoted and raw
  301. ;; strings, each with different quoting rules. PHP has the kind of
  302. ;; belt-and-suspenders solution you would expect: regexps are in
  303. ;; strings, /and/ you have to include the `/ ... /' delimiters!
  304. ;; Duh.
  305. ;; 2. A function to copy faces back from the parsed string to the
  306. ;; original buffer text. This has to recognize any escape sequences
  307. ;; so they can be treated as a single character.
  308. ;; 4 Internal details
  309. ;; ==================
  310. ;; `rxt' defines an internal syntax tree representation of regular
  311. ;; expressions, parsers for Elisp and PCRE syntax, and 'unparsers'
  312. ;; to convert the internal representation to PCRE or `rx' syntax.
  313. ;; Converting from the internal representation to Emacs syntax is
  314. ;; done by converting to `rx' form and passing it to `rx-to-string'.
  315. ;; See `rxt-parse-re', `rxt-adt->pcre', and `rxt-adt->rx' for
  316. ;; details.
  317. ;; This code is partially based on Olin Shivers' reference SRE
  318. ;; implementation in scsh, although it is simplified in some respects and
  319. ;; extended in others. See `scsh/re.scm', `scsh/spencer.scm' and
  320. ;; `scsh/posixstr.scm' in the `scsh' source tree for details. In
  321. ;; particular, `pcre2el' steals the idea of an abstract data type for
  322. ;; regular expressions and the general structure of the string regexp
  323. ;; parser and unparser. The data types for character sets are extended in
  324. ;; order to support symbolic translation between character set
  325. ;; expressions without assuming a small (Latin1) character set. The
  326. ;; string parser is also extended to parse a bigger variety of
  327. ;; constructions, including POSIX character classes and various Emacs and
  328. ;; Perl regexp assertions. Otherwise, only the bare minimum of scsh's
  329. ;; abstract data type is implemented.
  330. ;; 5 Soapbox
  331. ;; =========
  332. ;; Emacs regexps have their annoyances, but it is worth getting used to
  333. ;; them. The Emacs assertions for word boundaries, symbol boundaries, and
  334. ;; syntax classes depending on the syntax of the mode in effect are
  335. ;; especially useful. (PCRE has `\b' for word-boundary, but AFAIK it
  336. ;; doesn't have separate assertions for beginning-of-word and
  337. ;; end-of-word). Other things that might be done with huge regexps in
  338. ;; other languages can be expressed more understandably in Elisp using
  339. ;; combinations of `save-excursion' with the various searches (regexp,
  340. ;; literal, skip-syntax-forward, sexp-movement functions, etc.).
  341. ;; There's not much point in using `rxt-pcre-to-elisp' to use PCRE
  342. ;; notation in a Lisp program you're going to maintain, since you still
  343. ;; have to double all the backslashes. Better to just use the converted
  344. ;; result (or better yet, the `rx' form).
  345. ;; 6 History and acknowledgments
  346. ;; =============================
  347. ;; This was originally created out of an answer to a stackoverflow
  348. ;; question:
  349. ;; [http://stackoverflow.com/questions/9118183/elisp-mechanism-for-converting-pcre-regexps-to-emacs-regexps]
  350. ;; Thanks to:
  351. ;; - Wes Hardaker (hardaker) for the initial inspiration and subsequent
  352. ;; hacking
  353. ;; - priyadarshan for requesting RX support
  354. ;; - Daniel Colascione (dcolascione) for a patch to support Emacs's
  355. ;; explicitly-numbered match groups
  356. ;; - Aaron Meurer (asmeurer) for requesting Isearch support
  357. ;; - Philippe Vaucher (silex) for a patch to support `ibuffer-do-replace-regexp'
  358. ;; in PCRE mode
  359. ;;; Code:
  360. (require 'cl-lib)
  361. (require 'rx)
  362. (require 're-builder)
  363. (require 'advice)
  364. (require 'ring)
  365. (require 'pcase)
  366. ;;; Customization group
  367. (defgroup rxt nil
  368. "Regex syntax converter and utilities."
  369. :version 1.2
  370. :group 'tools
  371. :group 'lisp
  372. :link '(emacs-commentary-link :tag "commentary" "pcre2el.el")
  373. :link '(emacs-library-link :tag "lisp file" "pcre2el.el")
  374. :link '(url-link :tag "web page" "https://github.com/joddie/pcre2el"))
  375. (defface rxt-highlight-face
  376. '((((min-colors 16581375) (background light)) :background "#eee8d5")
  377. (((min-colors 16581375) (background dark)) :background "#222222"))
  378. "Face for highlighting corresponding regex syntax in `rxt-explain' buffers."
  379. :group 'rxt)
  380. (defcustom rxt-verbose-rx-translation nil
  381. "Non-nil if `rxt-pcre-to-rx' and `rxt-elisp-to-rx' should use verbose `rx' primitives.
  382. Verbose primitives are things like `line-start' instead of `bol',
  383. etc."
  384. :group 'rxt
  385. :type 'boolean)
  386. (defcustom rxt-explain-verbosely t
  387. "Non-nil if `rxt-explain-elisp' and `rxt-explain-pcre' should use verbose `rx' primitives.
  388. This overrides the value of `rxt-verbose-rx-translation' for
  389. these commands only."
  390. :group 'rxt
  391. :type 'boolean)
  392. ;;;; Macros and functions for writing interactive input and output
  393. ;; Macros for handling return values. If called interactively,
  394. ;; display the value in the echo area and copy it to the kill ring,
  395. ;; otherwise just return the value. PCREs are copied as unquoted
  396. ;; strings for yanking into Perl, JS, etc. `rx' forms and other sexps
  397. ;; are copied as `read'-able literals for yanking into Elisp buffers.
  398. ;; Emacs regexps are copied twice: once as an unquoted value for
  399. ;; interactive use, and once as a readable string literal for yanking
  400. ;; into Elisp buffers.
  401. (defmacro rxt-return-pcre (expr)
  402. (let ((value (make-symbol "value")))
  403. `(let ((,value ,expr))
  404. (when (called-interactively-p 'any)
  405. (rxt--kill-pcre ,value))
  406. ,value)))
  407. (defmacro rxt-return-sexp (expr)
  408. (let ((value (make-symbol "value")))
  409. `(let ((,value ,expr))
  410. (when (called-interactively-p 'any)
  411. (rxt--kill-sexp ,value))
  412. ,value)))
  413. (defmacro rxt-return-emacs-regexp (expr)
  414. (let ((value (make-symbol "value")))
  415. `(let ((,value ,expr))
  416. (when (called-interactively-p 'any)
  417. (rxt--kill-emacs-regexp ,value))
  418. ,value)))
  419. (defun rxt--kill-sexp (value)
  420. (let ((lisp-literal (prin1-to-string value)))
  421. (message "%s" lisp-literal)
  422. (kill-new lisp-literal)))
  423. (defun rxt--kill-pcre (value)
  424. (message "%s" value)
  425. (kill-new value))
  426. (defun rxt--kill-emacs-regexp (value)
  427. (let ((lisp-literal (prin1-to-string value)))
  428. (message "%s" value)
  429. (kill-new lisp-literal)
  430. (kill-new value)))
  431. ;; Read an Elisp regexp interactively.
  432. ;;
  433. ;; Three possibilities:
  434. ;;
  435. ;; 1) With a prefix arg, reads literally from the minibuffer, w/o
  436. ;; using string syntax -- just like query-replace-regexp, etc.
  437. ;;
  438. ;; 2) If the region is active, use the text of the region literally
  439. ;; (again w/o string syntax)
  440. ;;
  441. ;; 3) Otherwise, eval the sexp before point (which might be a string
  442. ;; literal or an expression) and use its value. Falls back to method
  443. ;; (1) if this fails to produce a string value.
  444. ;;
  445. (cl-defun rxt-interactive/elisp (&optional (prompt "Emacs regexp: "))
  446. (list
  447. (cond (current-prefix-arg
  448. (read-string prompt))
  449. ((use-region-p)
  450. (buffer-substring-no-properties (region-beginning) (region-end)))
  451. (t
  452. (condition-case nil
  453. (save-excursion
  454. (while (nth 3 (syntax-ppss)) (forward-char))
  455. (let ((re (eval (preceding-sexp))))
  456. (if (stringp re) re
  457. (read-string prompt))))
  458. (error
  459. (read-string prompt)))))))
  460. ;; Read a PCRE regexp interactively.
  461. ;;
  462. ;; Three possibilities: As above, except that without prefix arg or
  463. ;; active region, tries to read a delimited regexp literal like /.../,
  464. ;; m/.../, or qr/.../ following point in the current buffer. Falls
  465. ;; back to reading from minibuffer if that fails.
  466. ;;
  467. ;; Returns the regexp, with flags as text properties.
  468. ;;
  469. ;; TODO: Different delimiters
  470. (cl-defun rxt-interactive/pcre (&optional (prompt "PCRE regexp: "))
  471. (list
  472. (cond (current-prefix-arg
  473. (rxt--read-pcre prompt))
  474. ((use-region-p)
  475. (buffer-substring-no-properties (region-beginning) (region-end)))
  476. (t
  477. (condition-case nil
  478. (rxt-read-delimited-pcre)
  479. (error ; Fall back to reading from minibuffer
  480. (rxt--read-pcre prompt)))))
  481. nil))
  482. (define-minor-mode rxt--read-pcre-mode
  483. "Minor-mode with key-bindings for toggling PCRE flags.
  484. You should not normally call this directly. It will be enabled
  485. in minibuffers for `read-regexp' and in the `re-builder' buffer
  486. when `pcre-mode' is active. These bindings will also be added to
  487. `isearch-mode-map' in `pcre-mode'."
  488. :initial nil
  489. :lighter nil
  490. :keymap
  491. `((,(kbd "C-c s") . ,#'rxt--toggle-s-mode)
  492. (,(kbd "C-c x") . ,#'rxt--toggle-x-mode)
  493. (,(kbd "C-c i") . ,#'rxt--toggle-i-mode)))
  494. (defun rxt--read-pcre (prompt)
  495. "Read a PCRE regexp for translation, together with option flags.
  496. The `s', `x', and `i' flags can be toggled using the following
  497. commands: \\<rxt--read-pcre-mode-map>
  498. \\[rxt--toggle-s-mode] : toggle `s' (single-line) mode
  499. \\[rxt--toggle-x-mode] : toggle `x' (extended) mode
  500. \\[rxt--toggle-i-mode] : toggle `i' (case-insensitive) mode
  501. In single-line mode, `.' will also match newlines.
  502. In extended mode, whitespace is ignored.
  503. Case-insensitive mode emulates matching without case,
  504. independently of Emacs's builtin `case-fold-search' setting.
  505. Note that this does not apply to backreferences."
  506. (minibuffer-with-setup-hook #'rxt--read-pcre-mode
  507. (read-from-minibuffer prompt)))
  508. (defun rxt--toggle-s-mode ()
  509. "Toggle emulated PCRE single-line (s) flag."
  510. (interactive)
  511. (rxt--toggle-flag ?s))
  512. (defun rxt--toggle-x-mode ()
  513. "Toggle emulated PCRE extended (x) flag."
  514. (interactive)
  515. (rxt--toggle-flag ?x))
  516. (defun rxt--toggle-i-mode ()
  517. "Toggle emulated PCRE case-insensitive (i) flag."
  518. (interactive)
  519. (rxt--toggle-flag ?i))
  520. (defun rxt--toggle-flag (char)
  521. "Toggle CHAR, a PCRE flag."
  522. (cond
  523. ((derived-mode-p 'reb-mode) ; RE-Builder
  524. (rxt--toggle-flag-re-builder char))
  525. ((minibufferp)
  526. (rxt--toggle-flag-minibuffer char))
  527. (isearch-mode
  528. (rxt--toggle-flag-isearch char))
  529. (t
  530. (error "Not in minibuffer, RE-Builder or isearch mode."))))
  531. (defun rxt--toggle-flag-re-builder (char)
  532. (save-excursion
  533. (goto-char (point-max))
  534. (search-backward "/")
  535. (forward-char)
  536. (when (looking-at (rx (* (any ?i ?s ?x))))
  537. (let ((inhibit-modification-hooks t))
  538. (replace-match (rxt--xor-flags (match-string 0) char) t t))))
  539. (reb-do-update))
  540. (defun rxt--toggle-flag-minibuffer (char)
  541. (setf (buffer-substring (minibuffer-prompt-end) (point-max))
  542. (rxt--toggle-flag-string (minibuffer-contents) char))
  543. (when
  544. (and (= (point) (minibuffer-prompt-end))
  545. (looking-at (rx "(?" (group (+ (any ?i ?s ?x))) ")")))
  546. (forward-sexp)))
  547. (defun rxt--toggle-flag-isearch (char)
  548. (when isearch-regexp
  549. (setq isearch-string
  550. (rxt--toggle-flag-string isearch-string char))
  551. (setq isearch-message
  552. (mapconcat #'isearch-text-char-description isearch-string ""))
  553. (isearch-search-and-update)))
  554. (defun rxt--toggle-flag-string (string char)
  555. (if (string-match (rx string-start "(?" (group (+ (any ?i ?s ?x))) ")")
  556. string)
  557. (let ((flags (rxt--xor-flags (match-string 1 string) char)))
  558. (if (string= flags "")
  559. (replace-match "" t t string)
  560. (replace-match flags t t string 1)))
  561. (format "(?%c)%s" char string)))
  562. (defun rxt--xor-flags (flags char)
  563. (concat
  564. (sort
  565. (cl-set-exclusive-or (string-to-list flags) (list char))
  566. #'<)))
  567. ;;;; Minor mode for using emulated PCRE syntax
  568. (defvar pcre-old-isearch-search-fun-function nil
  569. "Original value of `isearch-search-fun-function' before entering `pcre-mode.'
  570. This function is wrapped by `pcre-isearch-search-fun-function'
  571. and restored on exit from `pcre-mode'.")
  572. (make-variable-buffer-local 'pcre-old-isearch-search-fun-function)
  573. (defvar pcre-old-isearch-key-bindings nil
  574. "Alist of key-bindings to restore in `isearch-mode-map' on exiting `pcre-mode'.")
  575. ;;;###autoload
  576. (define-minor-mode pcre-mode
  577. "Use emulated PCRE syntax for regexps wherever possible.
  578. Advises the `interactive' specs of `read-regexp' and the
  579. following other functions so that they read PCRE syntax and
  580. translate to its Emacs equivalent:
  581. - `align-regexp'
  582. - `find-tag-regexp'
  583. - `sort-regexp-fields'
  584. - `isearch-message-prefix'
  585. - `ibuffer-do-replace-regexp'
  586. Also alters the behavior of `isearch-mode' when searching by regexp."
  587. nil " PCRE"
  588. nil
  589. :global t
  590. (if pcre-mode
  591. ;; Enabling
  592. (progn
  593. ;; Enable advice
  594. (ad-enable-regexp "pcre-mode")
  595. ;; Set up isearch hooks
  596. (add-hook 'isearch-mode-hook #'pcre-isearch-mode-hook)
  597. (add-hook 'isearch-mode-end-hook #'pcre-isearch-mode-end-hook)
  598. ;; Add the keybindings of `rxt--read-pcre-mode-map' to
  599. ;; `isearch-mode-map' (so that they do not cause an exit from
  600. ;; `isearch-mode'), and save any existing bindings for those
  601. ;; keys to restore on exit from `pcre-mode'.
  602. (setq pcre-old-isearch-key-bindings
  603. (cl-loop for key being the key-seqs of rxt--read-pcre-mode-map
  604. for def = (lookup-key isearch-mode-map key)
  605. collect (cons (copy-sequence key)
  606. (if (numberp def) nil def))))
  607. (cl-loop for key being the key-seqs of rxt--read-pcre-mode-map
  608. using (key-bindings def)
  609. do (define-key isearch-mode-map key def)))
  610. ;; Disable advice
  611. (ad-disable-regexp "pcre-mode")
  612. ;; Remove from isearch hooks
  613. (remove-hook 'isearch-mode-hook #'pcre-isearch-mode-hook)
  614. (remove-hook 'isearch-mode-end-hook #'pcre-isearch-mode-end-hook)
  615. ;; Restore key-bindings
  616. (cl-loop for (key . def) in pcre-old-isearch-key-bindings
  617. do (define-key isearch-mode-map key def)))
  618. ;; "Activating" advice re-computes the function definitions, which
  619. ;; is necessary whether enabling or disabling
  620. (ad-activate-regexp "pcre-mode"))
  621. ;;; Cache of PCRE -> Elisp translations
  622. (defvar pcre-mode-cache-size 100
  623. "Number of PCRE-to-Emacs translations to keep in the `pcre-mode' cache.")
  624. (defvar pcre-mode-cache (make-hash-table :test 'equal)
  625. "Cache of PCRE-to-Emacs translations used in `pcre-mode'.
  626. Keys are PCRE regexps, values are their Emacs equivalents.")
  627. (defvar pcre-mode-reverse-cache (make-hash-table :test 'equal)
  628. "Cache of original PCREs translated to Emacs syntax in `pcre-mode'.
  629. Keys are translated Emacs regexps, values are their original PCRE
  630. form. This is used to display the original PCRE regexp in place
  631. of its translated form.")
  632. (defvar pcre-cache-ring (make-ring pcre-mode-cache-size)
  633. "Ring of PCRE-to-Emacs translations used in `pcre-mode'.
  634. When the ring fills up, the oldest element is removed and the
  635. corresponding entries are deleted from the hash tables
  636. `pcre-mode-cache' and `pcre-mode-reverse-cache'.")
  637. (defun pcre-to-elisp/cached (pcre)
  638. "Translate PCRE to Emacs syntax, caching both forms."
  639. (or (gethash pcre pcre-mode-cache)
  640. (let ((elisp (rxt-pcre-to-elisp pcre)))
  641. (pcre-set-cache pcre elisp)
  642. elisp)))
  643. (defun pcre-set-cache (pcre-regexp emacs-regexp)
  644. "Add a PCRE-to-Emacs translation to the `pcre-mode' cache."
  645. (when (and (not (zerop (length pcre-regexp)))
  646. (not (zerop (length emacs-regexp)))
  647. (not (gethash pcre-regexp pcre-mode-cache)))
  648. (if (= (ring-length pcre-cache-ring) (ring-size pcre-cache-ring))
  649. (let* ((old-item (ring-remove pcre-cache-ring))
  650. (old-pcre (car old-item))
  651. (old-emacs (cdr old-item)))
  652. (remhash old-pcre pcre-mode-cache)
  653. (remhash old-emacs pcre-mode-reverse-cache))
  654. (puthash pcre-regexp emacs-regexp pcre-mode-cache)
  655. (puthash emacs-regexp pcre-regexp pcre-mode-reverse-cache)
  656. (ring-insert pcre-cache-ring (cons pcre-regexp emacs-regexp)))))
  657. ;;; Isearch advice
  658. (defun pcre-isearch-mode-hook ()
  659. (when (not (eq isearch-search-fun-function #'isearch-search-fun-default))
  660. (message "Warning: pcre-mode overriding existing isearch function `%s'"
  661. isearch-search-fun-function))
  662. ;; Prevent an infinite loop, if a previous isearch in pcre-mode
  663. ;; exited without restoring the original search function for some
  664. ;; reason
  665. (unless (eq isearch-search-fun-function #'pcre-isearch-search-fun-function)
  666. (setq pcre-old-isearch-search-fun-function isearch-search-fun-function))
  667. (set (make-local-variable 'isearch-search-fun-function)
  668. #'pcre-isearch-search-fun-function))
  669. (defun pcre-isearch-mode-end-hook ()
  670. (setq isearch-search-fun-function pcre-old-isearch-search-fun-function))
  671. (defun pcre-isearch-search-fun-function ()
  672. "Enable isearching using emulated PCRE syntax.
  673. This is set as the value of `isearch-search-fun-function' when
  674. `pcre-mode' is enabled. Returns a function which searches using
  675. emulated PCRE regexps when `isearch-regexp' is true."
  676. (lambda (string bound noerror)
  677. (let ((real-search-function
  678. (funcall (or pcre-old-isearch-search-fun-function 'isearch-search-fun-default))))
  679. (if (not isearch-regexp)
  680. (funcall real-search-function string bound noerror)
  681. ;; Raise an error if the regexp ends in an incomplete escape
  682. ;; sequence (= odd number of backslashes).
  683. ;; TODO: Perhaps this should really be handled in rxt-pcre-to-elisp?
  684. (if (isearch-backslash string) (rxt-error "Trailing backslash"))
  685. (funcall real-search-function
  686. (pcre-to-elisp/cached string) bound noerror)))))
  687. (defadvice isearch-message-prefix (after pcre-mode disable)
  688. "Add \"PCRE\" to the Isearch message when searching by regexp in `pcre-mode'."
  689. (when (and isearch-regexp
  690. ;; Prevent an inaccurate message if our callback was
  691. ;; removed somehow
  692. (eq isearch-search-fun-function #'pcre-isearch-search-fun-function))
  693. (let ((message ad-return-value))
  694. ;; Some hackery to give replacement the same fontification as
  695. ;; the original
  696. (when
  697. (let ((case-fold-search t)) (string-match "regexp" message))
  698. (let* ((match (match-string 0 message))
  699. (properties (text-properties-at 0 match))
  700. (replacement (apply #'propertize "PCRE regexp" properties))
  701. (new-message (replace-match replacement t t message)))
  702. (setq ad-return-value new-message))))))
  703. (defadvice isearch-fallback
  704. (before pcre-mode (want-backslash &optional allow-invalid to-barrier) disable)
  705. "Hack to fall back correctly in `pcre-mode'. "
  706. ;; A dirty hack to the internals of isearch. Falling back to a
  707. ;; previous match position is necessary when the (Emacs) regexp ends
  708. ;; in "*", "?", "\{" or "\|": this is handled in
  709. ;; `isearch-process-search-char' by calling `isearch-fallback' with
  710. ;; `t' for the value of the first parameter, `want-backslash', in
  711. ;; the last two cases. With PCRE regexps, falling back should take
  712. ;; place on "*", "?", "{" or "|", with no backslashes required.
  713. ;; This advice handles the last two cases by unconditionally setting
  714. ;; `want-backslash' to nil.
  715. (ad-set-arg 0 nil))
  716. (defadvice isearch-edit-string
  717. (around pcre-mode disable)
  718. "Add PCRE mode-toggling keys to Isearch minibuffer in regexp mode."
  719. (if isearch-regexp
  720. (minibuffer-with-setup-hook
  721. #'rxt--read-pcre-mode
  722. ad-do-it)
  723. ad-do-it))
  724. ;;; evil-mode advice
  725. (defadvice evil-search-function
  726. (around pcre-mode (forward regexp-p wrap) disable)
  727. (if (and regexp-p (not isearch-mode))
  728. (let ((real-search-function ad-do-it))
  729. (setq ad-return-value
  730. (pcre-decorate-search-function real-search-function)))
  731. ad-do-it))
  732. (eval-after-load "evil"
  733. '(when pcre-mode
  734. (ad-enable-advice 'evil-search-function 'around 'pcre-mode)
  735. (ad-activate 'evil-search-function)))
  736. (defun pcre-decorate-search-function (real-search-function)
  737. (lambda (string &optional bound noerror count)
  738. (funcall real-search-function
  739. (pcre-to-elisp/cached string)
  740. bound noerror count)))
  741. ;;; Other hooks and defadvices
  742. ;;;###autoload
  743. (defun pcre-query-replace-regexp ()
  744. "Perform `query-replace-regexp' using PCRE syntax.
  745. Consider using `pcre-mode' instead of this function."
  746. (interactive)
  747. (let ((old-pcre-mode pcre-mode))
  748. (unwind-protect
  749. (progn
  750. (pcre-mode +1)
  751. (call-interactively #'query-replace-regexp))
  752. (pcre-mode (if old-pcre-mode 1 0)))))
  753. (defadvice add-to-history
  754. (before pcre-mode (history-var newelt &optional maxelt keep-all) disable)
  755. "Add the original PCRE to query-replace history in `pcre-mode'."
  756. (when (eq history-var query-replace-from-history-variable)
  757. (let ((original (gethash newelt pcre-mode-reverse-cache)))
  758. (when original
  759. (ad-set-arg 1 original)))))
  760. (defadvice query-replace-descr
  761. (before pcre-mode (from) disable)
  762. "Use the original PCRE in Isearch prompts in `pcre-mode'."
  763. (let ((original (gethash from pcre-mode-reverse-cache)))
  764. (when original
  765. (ad-set-arg 0 original))))
  766. ;;; The `interactive' specs of the following functions are lifted
  767. ;;; wholesale from the original built-ins, which see.
  768. (defadvice read-regexp
  769. (around pcre-mode first (prompt &optional defaults history) disable)
  770. "Read regexp using PCRE syntax and convert to Elisp equivalent."
  771. (ad-set-arg 0 (concat "[PCRE] " prompt))
  772. (minibuffer-with-setup-hook
  773. #'rxt--read-pcre-mode
  774. ad-do-it)
  775. (setq ad-return-value
  776. (pcre-to-elisp/cached ad-return-value)))
  777. (defadvice align-regexp
  778. (before pcre-mode first (beg end regexp &optional group spacing repeat) disable)
  779. "Read regexp using PCRE syntax and convert to Elisp equivalent."
  780. (interactive
  781. (append
  782. (list (region-beginning) (region-end))
  783. (if current-prefix-arg
  784. (list (rxt-pcre-to-elisp
  785. (read-string "Complex align using PCRE regexp: "
  786. "(\\s*)"))
  787. (string-to-number
  788. (read-string
  789. "Parenthesis group to modify (justify if negative): " "1"))
  790. (string-to-number
  791. (read-string "Amount of spacing (or column if negative): "
  792. (number-to-string align-default-spacing)))
  793. (y-or-n-p "Repeat throughout line? "))
  794. (list (concat "\\(\\s-*\\)"
  795. (rxt-pcre-to-elisp
  796. (read-string "Align PCRE regexp: ")))
  797. 1 align-default-spacing nil)))))
  798. (defadvice ibuffer-do-replace-regexp
  799. (before pcre-mode first (from-str to-str) disable)
  800. "Read regexp using PCRE syntax and convert to Elisp equivalent."
  801. (interactive
  802. (let* ((from-str (read-from-minibuffer "[PCRE] Replace regexp: "))
  803. (to-str (read-from-minibuffer (concat "[PCRE] Replace " from-str " with: "))))
  804. (list (rxt-pcre-to-elisp from-str) to-str))))
  805. (defadvice find-tag-regexp
  806. (before pcre-mode first (regexp &optional next-p other-window) disable)
  807. "Read regexp using PCRE syntax and convert to Elisp equivalent.
  808. Perform `find-tag-regexp' using emulated PCRE regexp syntax."
  809. (interactive
  810. (let ((args (find-tag-interactive "[PCRE] Find tag regexp: " t)))
  811. (list (rxt-pcre-to-elisp (nth 0 args))
  812. (nth 1 args) (nth 2 args)))))
  813. (defadvice sort-regexp-fields
  814. (before pcre-mode first (reverse record-regexp key-regexp beg end) disable)
  815. "Read regexp using PCRE syntax and convert to Elisp equivalent."
  816. (interactive "P\nsPCRE regexp specifying records to sort: \n\
  817. sPCRE regexp specifying key within record: \nr")
  818. (ad-set-arg 1 (rxt-pcre-to-elisp (ad-get-arg 1)))
  819. (ad-set-arg 2 (rxt-pcre-to-elisp (ad-get-arg 2))))
  820. ;;; Commands that take Emacs-style regexps as input
  821. ;;;###autoload
  822. (defun rxt-elisp-to-pcre (regexp)
  823. "Translate REGEXP, a regexp in Emacs Lisp syntax, to Perl-compatible syntax.
  824. Interactively, reads the regexp in one of three ways. With a
  825. prefix arg, reads from minibuffer without string escaping, like
  826. `query-replace-regexp'. Without a prefix arg, uses the text of
  827. the region if it is active. Otherwise, uses the result of
  828. evaluating the sexp before point (which might be a string regexp
  829. literal or an expression that produces a string).
  830. Displays the translated PCRE regexp in the echo area and copies
  831. it to the kill ring.
  832. Emacs regexp features such as syntax classes which cannot be
  833. translated to PCRE will cause an error."
  834. (interactive (rxt-interactive/elisp))
  835. (rxt-return-pcre (rxt-adt->pcre (rxt-parse-elisp regexp))))
  836. ;;;###autoload
  837. (defun rxt-elisp-to-rx (regexp)
  838. "Translate REGEXP, a regexp in Emacs Lisp syntax, to `rx' syntax.
  839. See `rxt-elisp-to-pcre' for a description of the interactive
  840. behavior and `rx' for documentation of the S-expression based
  841. regexp syntax."
  842. (interactive (rxt-interactive/elisp))
  843. (rxt-return-sexp (rxt-adt->rx (rxt-parse-elisp regexp))))
  844. ;;;###autoload
  845. (defun rxt-elisp-to-strings (regexp)
  846. "Return a list of all strings matched by REGEXP, an Emacs Lisp regexp.
  847. See `rxt-elisp-to-pcre' for a description of the interactive behavior.
  848. This is useful primarily for getting back the original list of
  849. strings from a regexp generated by `regexp-opt', but it will work
  850. with any regexp without unbounded quantifiers (*, +, {2, } and so
  851. on).
  852. Throws an error if REGEXP contains any infinite quantifiers."
  853. (interactive (rxt-interactive/elisp))
  854. (rxt-return-sexp (rxt-adt->strings (rxt-parse-elisp regexp))))
  855. ;;;###autoload
  856. (defun rxt-toggle-elisp-rx ()
  857. "Toggle the regexp near point between Elisp string and rx syntax."
  858. (interactive)
  859. ;; First, position point before the regex form near point (either
  860. ;; a string literal or a list beginning `rx' or `rx-to-string').
  861. (let* ((context (syntax-ppss))
  862. (string-start (nth 8 context)))
  863. (cond (string-start (goto-char string-start))
  864. ((looking-back "\"") (backward-sexp))
  865. ((looking-at "\"") nil)
  866. (t
  867. ;; Search backwards, leaving point in place on error
  868. (goto-char
  869. (save-excursion
  870. (skip-syntax-forward "-")
  871. (while (not (looking-at
  872. (rx "(" (or "rx" "rx-to-string") symbol-end)))
  873. (backward-up-list))
  874. (point))))))
  875. ;; Read and replace the regex following point
  876. (let* ((regex (read (current-buffer)))
  877. (print-escape-newlines t))
  878. (save-excursion
  879. (if (listp regex)
  880. ;; Replace rx form with string value
  881. (prin1 (eval regex) (current-buffer))
  882. ;; Pretty-print rx form
  883. (save-restriction
  884. (let* ((start (point))
  885. (rx-syntax (rxt-elisp-to-rx regex))
  886. (rx-form
  887. (pcase rx-syntax
  888. (`(seq . ,rest) `(rx . ,rest))
  889. (form `(rx ,form)))))
  890. (rxt-print rx-form)
  891. (narrow-to-region start (point)))
  892. (pp-buffer)
  893. ;; remove the extra newline that pp-buffer inserts
  894. (goto-char (point-max))
  895. (delete-region
  896. (point)
  897. (save-excursion (skip-chars-backward " \t\n") (point))))))
  898. (kill-sexp -1)
  899. (indent-pp-sexp)))
  900. ;;; Commands that translate PCRE to other formats
  901. ;;;###autoload
  902. (defun rxt-pcre-to-elisp (pcre &optional flags)
  903. "Translate PCRE, a regexp in Perl-compatible syntax, to Emacs Lisp.
  904. Interactively, uses the contents of the region if it is active,
  905. otherwise reads from the minibuffer. Prints the Emacs translation
  906. in the echo area and copies it to the kill ring.
  907. PCRE regexp features that cannot be translated into Emacs syntax
  908. will cause an error. See the commentary section of pcre2el.el for
  909. more details."
  910. (interactive (rxt-interactive/pcre))
  911. (rxt-return-emacs-regexp
  912. (rx-to-string
  913. (rxt-pcre-to-rx (rxt--add-flags pcre flags))
  914. t)))
  915. ;;;###autoload
  916. (defalias 'pcre-to-elisp 'rxt-pcre-to-elisp)
  917. ;;;###autoload
  918. (defun rxt-pcre-to-rx (pcre &optional flags)
  919. "Translate PCRE, a regexp in Perl-compatible syntax, to `rx' syntax.
  920. See `rxt-pcre-to-elisp' for a description of the interactive behavior."
  921. (interactive (rxt-interactive/pcre))
  922. (rxt-return-sexp (rxt-adt->rx (rxt-parse-pcre (rxt--add-flags pcre flags)))))
  923. ;;;###autoload
  924. (defun rxt-pcre-to-strings (pcre &optional flags)
  925. "Return a list of all strings matched by PCRE, a Perl-compatible regexp.
  926. See `rxt-elisp-to-pcre' for a description of the interactive
  927. behavior and `rxt-elisp-to-strings' for why this might be useful.
  928. Throws an error if PCRE contains any infinite quantifiers."
  929. (interactive (rxt-interactive/pcre))
  930. (rxt-return-sexp (rxt-adt->strings (rxt-parse-pcre (rxt--add-flags pcre flags)))))
  931. (defun rxt--add-flags (pcre flags)
  932. "Prepend FLAGS to PCRE."
  933. (if (not (zerop (length flags)))
  934. (concat "(?" flags ")" pcre)
  935. pcre))
  936. ;;; Regexp explaining functions to display pretty-printed rx syntax
  937. ;; When the `rxt-explain' flag is non-nil, `rxt-adt->rx' records
  938. ;; location information for each element of the generated `rx' form,
  939. ;; allowing highlighting corresponding pieces of syntax at point.
  940. (defvar rxt-explain nil)
  941. (defvar rxt-highlight-overlays nil
  942. "List of active location-highlighting overlays in rxt-help-mode buffer.")
  943. ;;;###autoload
  944. (defun rxt-explain-elisp (regexp)
  945. "Insert the pretty-printed `rx' syntax for REGEXP in a new buffer.
  946. REGEXP is a regular expression in Emacs Lisp syntax. See
  947. `rxt-elisp-to-pcre' for a description of how REGEXP is read
  948. interactively."
  949. (interactive (rxt-interactive/elisp))
  950. (let ((rxt-explain t)
  951. (rxt-verbose-rx-translation rxt-explain-verbosely))
  952. (rxt-pp-rx regexp (rxt-elisp-to-rx regexp))))
  953. ;;;###autoload
  954. (defun rxt-explain-pcre (regexp &optional flags)
  955. "Insert the pretty-printed `rx' syntax for REGEXP in a new buffer.
  956. REGEXP is a regular expression in PCRE syntax. See
  957. `rxt-pcre-to-elisp' for a description of how REGEXP is read
  958. interactively."
  959. (interactive (rxt-interactive/pcre))
  960. (let ((rxt-explain t)
  961. (rxt-verbose-rx-translation rxt-explain-verbosely))
  962. (rxt-pp-rx regexp (rxt-pcre-to-rx regexp flags))))
  963. ;;;###autoload
  964. (defun rxt-quote-pcre (text)
  965. "Return a PCRE regexp which matches TEXT literally.
  966. Any PCRE metacharacters in TEXT will be quoted with a backslash."
  967. (rxt-adt->pcre (rxt-string text)))
  968. ;;;; Commands that depend on the major mode in effect
  969. ;; Macro: interactively call one of two functions depending on the
  970. ;; major mode
  971. (defmacro rxt-mode-dispatch (elisp-function pcre-function)
  972. `(if (memq major-mode '(emacs-lisp-mode lisp-interaction-mode))
  973. (call-interactively #',elisp-function)
  974. (call-interactively #',pcre-function)))
  975. ;;;###autoload
  976. (defun rxt-explain ()
  977. "Pop up a buffer with pretty-printed `rx' syntax for the regex at point.
  978. Chooses regex syntax to read based on current major mode, calling
  979. `rxt-explain-elisp' if buffer is in `emacs-lisp-mode' or
  980. `lisp-interaction-mode', or `rxt-explain-pcre' otherwise."
  981. (interactive)
  982. (rxt-mode-dispatch rxt-explain-elisp rxt-explain-pcre))
  983. ;;;###autoload
  984. (defun rxt-convert-syntax ()
  985. "Convert regex at point to other kind of syntax, depending on major mode.
  986. For buffers in `emacs-lisp-mode' or `lisp-interaction-mode',
  987. calls `rxt-elisp-to-pcre' to convert to PCRE syntax. Otherwise,
  988. calls `rxt-pcre-to-elisp' to convert to Emacs syntax.
  989. The converted syntax is displayed in the echo area and copied to
  990. the kill ring; see the two functions named above for details."
  991. (interactive)
  992. (rxt-mode-dispatch rxt-elisp-to-pcre rxt-pcre-to-elisp))
  993. ;;;###autoload
  994. (defun rxt-convert-to-rx ()
  995. "Convert regex at point to RX syntax. Chooses Emacs or PCRE syntax by major mode."
  996. (interactive)
  997. (rxt-mode-dispatch rxt-elisp-to-rx rxt-pcre-to-rx))
  998. ;;;###autoload
  999. (defun rxt-convert-to-strings ()
  1000. "Convert regex at point to RX syntax. Chooses Emacs or PCRE syntax by major mode."
  1001. (interactive)
  1002. (rxt-mode-dispatch rxt-elisp-to-strings rxt-pcre-to-strings))
  1003. ;;; Minor mode and keybindings
  1004. (defvar rxt-mode-map
  1005. (let ((map (make-sparse-keymap)))
  1006. ;; Generic
  1007. (define-key map (kbd "C-c / /") 'rxt-explain)
  1008. (define-key map (kbd "C-c / c") 'rxt-convert-syntax)
  1009. (define-key map (kbd "C-c / x") 'rxt-convert-to-rx)
  1010. (define-key map (kbd "C-c / '") 'rxt-convert-to-strings)
  1011. ;; From PCRE
  1012. (define-key map (kbd "C-c / p /") 'rxt-explain-pcre)
  1013. (define-key map (kbd "C-c / p e") 'rxt-pcre-to-elisp)
  1014. (define-key map (kbd "C-c / p x") 'rxt-pcre-to-rx)
  1015. (define-key map (kbd "C-c / p '") 'rxt-pcre-to-strings)
  1016. ;; From Elisp
  1017. (define-key map (kbd "C-c / e /") 'rxt-explain-elisp)
  1018. (define-key map (kbd "C-c / e p") 'rxt-elisp-to-pcre)
  1019. (define-key map (kbd "C-c / e x") 'rxt-elisp-to-rx)
  1020. (define-key map (kbd "C-c / e '") 'rxt-elisp-to-strings)
  1021. (define-key map (kbd "C-c / e t") 'rxt-toggle-elisp-rx)
  1022. (define-key map (kbd "C-c / t") 'rxt-toggle-elisp-rx)
  1023. ;; Search
  1024. (define-key map (kbd "C-c / %") 'pcre-query-replace-regexp)
  1025. map)
  1026. "Keymap for `rxt-mode'.")
  1027. ;;;###autoload
  1028. (define-minor-mode rxt-mode
  1029. "Regex translation utilities." nil nil)
  1030. ;;;###autoload
  1031. (defun turn-on-rxt-mode ()
  1032. "Turn on `rxt-mode' in the current buffer."
  1033. (interactive)
  1034. (rxt-mode 1))
  1035. ;;;###autoload
  1036. (define-globalized-minor-mode rxt-global-mode rxt-mode
  1037. turn-on-rxt-mode)
  1038. ;;;; Syntax explanations
  1039. ;; Major mode for displaying pretty-printed S-exp syntax
  1040. (define-derived-mode rxt-help-mode emacs-lisp-mode "Regexp Explain"
  1041. (setq buffer-read-only t)
  1042. (add-hook 'post-command-hook 'rxt-highlight-text nil t)
  1043. (rxt-highlight-text))
  1044. ;; Hack: stop paredit-mode interfering with `rxt-print'
  1045. (eval-when-compile (declare-function paredit-mode "paredit.el"))
  1046. (add-hook 'rxt-help-mode-hook
  1047. (lambda ()
  1048. (if (and (boundp 'paredit-mode)
  1049. paredit-mode)
  1050. (paredit-mode 0))))
  1051. (define-key rxt-help-mode-map "q" 'quit-window)
  1052. (define-key rxt-help-mode-map "z" 'kill-this-buffer)
  1053. (define-key rxt-help-mode-map "n" 'next-line)
  1054. (define-key rxt-help-mode-map "p" 'previous-line)
  1055. (define-key rxt-help-mode-map "f" 'forward-list)
  1056. (define-key rxt-help-mode-map "b" 'backward-list)
  1057. (define-key rxt-help-mode-map "u" 'backward-up-list)
  1058. (define-key rxt-help-mode-map "d" 'down-list)
  1059. (defvar rxt--print-with-overlays nil)
  1060. (defvar rxt--print-depth 0)
  1061. (defconst rxt--print-char-alist
  1062. '((?\a . "\\a")
  1063. (?\b . "\\b")
  1064. (?\t . "\\t")
  1065. (?\n . "\\n")
  1066. (?\v . "\\v")
  1067. (?\f . "\\f")
  1068. (?\r . "\\r")
  1069. (?\e . "\\e")
  1070. (?\s . "\\s")
  1071. (?\\ . "\\\\")
  1072. (?\d . "\\d"))
  1073. "Alist of characters to print using an escape sequence in Elisp source.
  1074. See (info \"(elisp) Basic Char Syntax\").")
  1075. (defconst rxt--whitespace-display-regexp
  1076. (rx-to-string `(any ,@(mapcar #'car rxt--print-char-alist))))
  1077. (defconst rxt--print-special-chars
  1078. '(?\( ?\) ?\\ ?\| ?\; ?\' ?\` ?\" ?\# ?\. ?\,)
  1079. "Characters which require a preceding backslash in Elisp source.
  1080. See (info \"(elisp) Basic Char Syntax\").")
  1081. (defun rxt-pp-rx (regexp rx)
  1082. "Display string regexp REGEXP with its `rx' form RX in an `rxt-help-mode' buffer."
  1083. (with-current-buffer (get-buffer-create "* Regexp Explain *")
  1084. (let ((print-escape-newlines t)
  1085. (inhibit-read-only t))
  1086. (erase-buffer)
  1087. (rxt-help-mode)
  1088. (insert (rxt--propertize-whitespace regexp))
  1089. (newline 2)
  1090. (save-excursion
  1091. (let ((sexp-begin (point))
  1092. (rxt--print-with-overlays t))
  1093. (rxt-print rx)
  1094. (narrow-to-region sexp-begin (point))
  1095. (pp-buffer)
  1096. (widen)))
  1097. (rxt-highlight-text))
  1098. (pop-to-buffer (current-buffer))))
  1099. (cl-defun rxt-print (rx)
  1100. "Insert RX, an `rx' form, into the current buffer, optionally adding overlays.
  1101. Similar to `print' or `prin1', but ensures that `rx' forms are
  1102. printed readably, using character or integer syntax depending on
  1103. context.
  1104. If `rxt--print-with-overlays' is non-nil, also creates overlays linking
  1105. elements of RX to their corresponding locations in the source
  1106. string (see `rxt-explain-elisp', `rxt-explain-pcre' and
  1107. `rxt--make-help-overlays')."
  1108. (let ((start (point)))
  1109. (cl-typecase rx
  1110. (cons
  1111. (pcase rx
  1112. (`(,(and (or `repeat `**) head)
  1113. ,(and (pred integerp) from)
  1114. ,(and (pred integerp) to)
  1115. . ,rest)
  1116. (insert (format "(%s %d %d" head from to))
  1117. (rxt--print-list-tail rest))
  1118. (`(,(and (or `repeat `= `>=) head)
  1119. ,(and (pred integerp) n)
  1120. . ,rest)
  1121. (insert (format "(%s %d" head n))
  1122. (rxt--print-list-tail rest))
  1123. (_
  1124. (rxt--print-list-tail rx t))))
  1125. (symbol
  1126. (cl-case rx
  1127. ;; `print' escapes the ? characters in the rx operators *?
  1128. ;; and +?, but this looks bad and is not strictly necessary:
  1129. ;; (eq (read "*?") (read "*\\?")) => t
  1130. ;; (eq (read "+?") (read "+\\?")) => t
  1131. ((*? +?) (insert (symbol-name rx)))
  1132. (t (prin1 rx (current-buffer)))))
  1133. (string
  1134. (insert (rxt--propertize-whitespace (prin1-to-string rx))))
  1135. (character
  1136. (cond
  1137. ((eq ? rx)
  1138. (insert "?"))
  1139. ((memq rx rxt--print-special-chars)
  1140. (insert "?\\" rx))
  1141. ((assq rx rxt--print-char-alist)
  1142. (insert "?" (assoc-default rx rxt--print-char-alist)))
  1143. (t
  1144. (insert "?" (char-to-string rx)))))
  1145. (t
  1146. (prin1 rx (current-buffer))))
  1147. (when rxt--print-with-overlays
  1148. (rxt--make-help-overlays rx start (point)))))
  1149. (defun rxt--print-list-tail (tail &optional open-paren)
  1150. (let ((rxt--print-depth (1+ rxt--print-depth)))
  1151. (let ((done nil))
  1152. (while (not done)
  1153. (cl-typecase tail
  1154. (null
  1155. (insert ")")
  1156. (setq done t))
  1157. (cons
  1158. (if open-paren
  1159. (progn
  1160. (insert "(")
  1161. (setq open-paren nil))
  1162. (insert " "))
  1163. (rxt-print (car tail))
  1164. (setq tail (cdr tail)))
  1165. (t
  1166. (insert " . ")
  1167. (rxt-print tail)
  1168. (insert ")")
  1169. (setq done t)))))))
  1170. (defun rxt--make-help-overlays (rx start end)
  1171. (let ((location (rxt-location rx)))
  1172. (when (and location
  1173. (rxt-location-start location)
  1174. (rxt-location-end location))
  1175. (let* ((sexp-begin (copy-marker start t))
  1176. (sexp-end (copy-marker end))
  1177. (sexp-bounds (list sexp-begin sexp-end))
  1178. (source-begin (1+ (rxt-location-start location)))
  1179. (source-end (1+ (rxt-location-end location)))
  1180. (source-bounds (list source-begin source-end))
  1181. (bounds (list source-bounds sexp-bounds))
  1182. (sexp-ol (make-overlay sexp-begin sexp-end (current-buffer) t nil))
  1183. (source-ol (make-overlay source-begin source-end (current-buffer) t nil)))
  1184. (dolist (ol (list sexp-ol source-ol))
  1185. (overlay-put ol 'priority rxt--print-depth)
  1186. (overlay-put ol 'rxt-bounds bounds))))))
  1187. (defun rxt--propertize-whitespace (string)
  1188. (let ((string (copy-sequence string))
  1189. (start 0))
  1190. (while (string-match rxt--whitespace-display-regexp string start)
  1191. (put-text-property (match-beginning 0) (match-end 0)
  1192. 'display
  1193. (assoc-default (string-to-char (match-string 0 string))
  1194. rxt--print-char-alist)
  1195. string)
  1196. (setq start (match-end 0)))
  1197. string))
  1198. (defun rxt-highlight-text ()
  1199. "Highlight the regex syntax at point and its corresponding RX/string form."
  1200. (let ((all-bounds (get-char-property (point) 'rxt-bounds)))
  1201. (mapc #'delete-overlay rxt-highlight-overlays)
  1202. (setq rxt-highlight-overlays nil)
  1203. (dolist (bounds all-bounds)
  1204. (cl-destructuring-bind (begin end) bounds
  1205. (let ((overlay (make-overlay begin end)))
  1206. (push overlay rxt-highlight-overlays)
  1207. (overlay-put overlay 'face 'rxt-highlight-face))))))
  1208. ;;;; Error handling
  1209. (if (fboundp 'define-error)
  1210. (define-error 'rxt-invalid-regexp "Invalid regexp" 'invalid-regexp)
  1211. (put 'rxt-invalid-regexp
  1212. 'error-conditions
  1213. '(rxt-invalid-regexp invalid-regexp error))
  1214. (put 'rxt-invalid-regexp 'error-message "Invalid regexp"))
  1215. (defun rxt-error (&rest format-args)
  1216. (signal 'rxt-invalid-regexp (list (apply #'format format-args))))
  1217. ;;;; Regexp syntax tree data type
  1218. ;; Base class from which other elements of the syntax-tree inherit
  1219. (cl-defstruct rxt-syntax-tree)
  1220. ;; Struct representing the original source location
  1221. (cl-defstruct rxt-location
  1222. source ; Either a string or a buffer
  1223. start end ; Offsets, 0- or 1-indexed as appropriate
  1224. )
  1225. (defun rxt-location-text (location)
  1226. (if (not (rxt-location-p location))
  1227. nil
  1228. (let ((start (rxt-location-start location))
  1229. (end (rxt-location-end location))
  1230. (source (rxt-location-source location)))
  1231. (cond
  1232. ((buffer-live-p source)
  1233. (with-current-buffer source
  1234. (buffer-substring-no-properties start end)))
  1235. ((stringp source)
  1236. (substring source start end))
  1237. (t nil)))))
  1238. ;; Hash table mapping from syntax-tree elements to source locations.
  1239. (defvar rxt-location-map (make-hash-table :weakness 'key))
  1240. (defun rxt-location (object)
  1241. (gethash object rxt-location-map))
  1242. (gv-define-setter rxt-location (value object)
  1243. `(puthash ,object ,value rxt-location-map))
  1244. (defun rxt-source-text (object)
  1245. (rxt-location-text (rxt-location object)))
  1246. (defun rxt-to-string (tree)
  1247. "Return a readable representation of TREE, a regex syntax-tree object."
  1248. (or (rxt-source-text tree)
  1249. (let ((print-level 1))
  1250. (prin1-to-string tree))))
  1251. (defalias 'rxt-syntax-tree-readable 'rxt-to-string)
  1252. ;; FIXME
  1253. (defvar rxt-pcre-case-fold nil)
  1254. ;; Literal string
  1255. (cl-defstruct
  1256. (rxt-string
  1257. (:constructor rxt-string (chars &optional case-fold))
  1258. (:include rxt-syntax-tree))
  1259. chars
  1260. (case-fold rxt-pcre-case-fold))
  1261. (defun rxt-empty-string ()
  1262. (rxt-string ""))
  1263. (defun rxt-trivial-p (re)
  1264. (and (rxt-string-p re)
  1265. (equal (rxt-string-chars re) "")))
  1266. ;;; Other primitives
  1267. (cl-defstruct (rxt-primitive
  1268. (:constructor rxt-primitive (pcre rx))
  1269. (:include rxt-syntax-tree))
  1270. pcre rx)
  1271. (defun rxt-bos () (rxt-primitive "\\A" 'bos))
  1272. (defun rxt-eos () (rxt-primitive "\\Z" 'eos))
  1273. (defun rxt-bol () (rxt-primitive "^" 'bol))
  1274. (defun rxt-eol () (rxt-primitive "$" 'eol))
  1275. ;; FIXME
  1276. (defun rxt-anything () (rxt-primitive "." 'anything))
  1277. (defun rxt-nonl () (rxt-primitive "." 'nonl))
  1278. (defun rxt-word-boundary () (rxt-primitive "\\b" 'word-boundary))
  1279. (defun rxt-not-word-boundary () (rxt-primitive "\\B" 'not-word-boundary))
  1280. (defun rxt-wordchar () (rxt-primitive "\\w" 'wordchar))
  1281. (defun rxt-not-wordchar () (rxt-primitive "\\W" 'not-wordchar))
  1282. (defun rxt-symbol-start () (rxt-primitive nil 'symbol-start))
  1283. (defun rxt-symbol-end () (rxt-primitive nil 'symbol-end))
  1284. (defun rxt-bow () (rxt-primitive nil 'bow))
  1285. (defun rxt-eow () (rxt-primitive nil 'eow))
  1286. ;;; Sequence
  1287. (cl-defstruct
  1288. (rxt-seq
  1289. (:constructor make-rxt-seq (elts))
  1290. (:include rxt-syntax-tree))
  1291. elts)
  1292. ;; Slightly smart sequence constructor:
  1293. ;; - Flattens nested sequences
  1294. ;; - Drops trivial "" elements
  1295. ;; - Empty sequence => ""
  1296. ;; - Singleton sequence is reduced to its one element.
  1297. (defun rxt-seq (&rest res) ; Flatten nested seqs & drop ""'s.
  1298. (let ((res (rxt-seq-flatten res)))
  1299. (if (consp res)
  1300. (if (consp (cdr res))
  1301. (make-rxt-seq res) ; General case
  1302. (car res)) ; Singleton sequence
  1303. (rxt-empty-string)))) ; Empty seq -- ""
  1304. (defun rxt-seq-flatten (res)
  1305. (if (consp res)
  1306. (let ((re (car res))
  1307. (tail (rxt-seq-flatten (cdr res))))
  1308. (cond ((rxt-seq-p re) ; Flatten nested seqs
  1309. (append (rxt-seq-flatten (rxt-seq-elts re)) tail))
  1310. ((rxt-trivial-p re) tail) ; Drop trivial elts
  1311. ((and (rxt-string-p re) ; Flatten strings
  1312. (consp tail)
  1313. (rxt-string-p (car tail)))
  1314. (cons
  1315. (rxt-string-concat re (car tail))
  1316. (cdr tail)))
  1317. (t (cons re tail))))
  1318. '()))
  1319. (defun rxt-string-concat (str1 str2)
  1320. (if (not (eq (rxt-string-case-fold str1)
  1321. (rxt-string-case-fold str2)))
  1322. (make-rxt-seq (list str1 str2))
  1323. (let ((result
  1324. (rxt-string (concat (rxt-string-chars str1)
  1325. (rxt-string-chars str2))
  1326. (rxt-string-case-fold str1)))
  1327. (first (rxt-location str1))
  1328. (last (rxt-location str2)))
  1329. (when (and first last)
  1330. (setf (rxt-location result)
  1331. (make-rxt-location :source (rxt-location-source first)
  1332. :start (rxt-location-start first)
  1333. :end (rxt-location-end last))))
  1334. result)))
  1335. ;;; Choice (alternation/union)
  1336. (cl-defstruct
  1337. (rxt-choice
  1338. (:constructor make-rxt-choice (elts))
  1339. (:include rxt-syntax-tree))
  1340. elts)
  1341. ;;; The empty choice represents a regexp that never matches in any context
  1342. (defvar rxt-empty (make-rxt-choice nil))
  1343. (defun rxt-empty-p (re)
  1344. (or
  1345. (and (rxt-choice-p re)
  1346. (null (rxt-choice-elts re)))
  1347. (rxt-empty-char-set-p re)))
  1348. (defun rxt-choice (&rest alternatives)
  1349. "Construct the alternation (union) of several regexps.
  1350. ALTERNATIVES should be a list of `rxt-syntax-tree' objects.
  1351. The return value is an `rxt-choice' object representing a regexp
  1352. which matches any one of ALTERNATIVES, but simplified in the
  1353. following ways:
  1354. - If ALTERNATIVES contains only one element, it is returned unchanged.
  1355. - All existing `rxt-choice' elements in ALTERNATIVES are replaced
  1356. by a flat list of their subexpressions: symbolically,
  1357. a|(b|(c|d)) is replaced by a|b|c|d
  1358. - All character sets and single-character strings in ALTERNATIVES
  1359. are combined together into one or two character sets,
  1360. respecting case-folding behaviour."
  1361. (cl-destructuring-bind (other-elements char-set case-fold-char-set)
  1362. (rxt--simplify-alternatives alternatives)
  1363. (let ((simplified-alternatives
  1364. (append (if (not (rxt-empty-p char-set))
  1365. (list char-set)
  1366. '())
  1367. (if (not (rxt-empty-p case-fold-char-set))
  1368. (list case-fold-char-set)
  1369. '())
  1370. other-elements)))
  1371. (pcase simplified-alternatives
  1372. (`()
  1373. rxt-empty)
  1374. (`(,element)
  1375. element)
  1376. (_
  1377. (make-rxt-choice simplified-alternatives))))))
  1378. (defun rxt--simplify-alternatives (alternatives)
  1379. "Simplify a set of regexp alternatives.
  1380. ALTERNATIVES should be a list of `rxt-syntax-tree' objects to be combined
  1381. into an `rxt-choice' structure. The result is a three-element
  1382. list (OTHER-ELEMENTS CHAR-SET CASE-FOLDED-CHAR-SET):
  1383. - CHAR-SET is an `rxt-char-set-union' containing the union of all
  1384. case-sensitive character sets and single-character strings in
  1385. RES.
  1386. - CASE-FOLDED-CHAR-SET is similar but combines all the
  1387. case-insensitive character sets and single-character strings.
  1388. - OTHER-ELEMENTS is a list of all other elements, with all
  1389. `rxt-choice' structures replaced by a flat list of their
  1390. component subexpressions."
  1391. (if (null alternatives)
  1392. (list '()
  1393. (make-rxt-char-set-union :case-fold nil)
  1394. (make-rxt-char-set-union :case-fold t))
  1395. (let* ((re (car alternatives)))
  1396. (cl-destructuring-bind (tail char-set case-fold-char-set)
  1397. (rxt--simplify-alternatives (cdr alternatives))
  1398. (cond ((rxt-choice-p re) ; Flatten nested choices
  1399. (list
  1400. (append (rxt-choice-elts re) tail)
  1401. char-set
  1402. case-fold-char-set))
  1403. ((rxt-empty-p re) ; Drop empty re's.
  1404. (list tail char-set case-fold-char-set))
  1405. ((rxt-char-set-union-p re) ; Fold char sets together
  1406. (if (rxt-char-set-union-case-fold re)
  1407. (list tail
  1408. char-set
  1409. (rxt-char-set-union case-fold-char-set re))
  1410. (list tail
  1411. (rxt-char-set-union char-set re)
  1412. case-fold-char-set)))
  1413. ((and (rxt-string-p re) ; Same for 1-char strings
  1414. (= 1 (length (rxt-string-chars re))))
  1415. (if (rxt-string-case-fold re)
  1416. (list tail
  1417. char-set
  1418. (rxt-char-set-union case-fold-char-set re))
  1419. (list tail
  1420. (rxt-char-set-union char-set re)
  1421. case-fold-char-set)))
  1422. (t ; Otherwise.
  1423. (list (cons re tail) char-set case-fold-char-set)))))))
  1424. ;;; Repetition
  1425. (cl-defstruct (rxt-repeat
  1426. (:include rxt-syntax-tree))
  1427. from to body greedy)
  1428. (cl-defun rxt-repeat (from to body &optional (greedy t))
  1429. (if (equal to 0)
  1430. (rxt-empty-string)
  1431. (make-rxt-repeat :from from :to to
  1432. :body body :greedy greedy)))
  1433. ;;; Submatch
  1434. (cl-defstruct
  1435. (rxt-submatch
  1436. (:constructor rxt-submatch (body))
  1437. (:include rxt-syntax-tree))
  1438. body)
  1439. ;;; Numbered submatch (Emacs only)
  1440. (cl-defstruct
  1441. (rxt-submatch-numbered
  1442. (:constructor rxt-submatch-numbered (n body))
  1443. (:include rxt-syntax-tree))
  1444. n
  1445. body)
  1446. ;;; Backreference
  1447. (cl-defstruct
  1448. (rxt-backref
  1449. (:constructor rxt-backref (n))
  1450. (:include rxt-syntax-tree))
  1451. n)
  1452. ;;; Syntax classes (Emacs only)
  1453. (cl-defstruct (rxt-syntax-class
  1454. (:include rxt-syntax-tree))
  1455. symbol)
  1456. (defun rxt-syntax-class (symbol)
  1457. (if (assoc symbol rx-syntax)
  1458. (make-rxt-syntax-class :symbol symbol)
  1459. (rxt-error "Invalid syntax class symbol `%s'" symbol)))
  1460. ;;; Character categories (Emacs only)
  1461. (cl-defstruct (rxt-char-category
  1462. (:include rxt-syntax-tree))
  1463. symbol)
  1464. (defun rxt-char-category (symbol)
  1465. (if (assoc symbol rx-categories)
  1466. (make-rxt-char-category :symbol symbol)
  1467. (rxt-error "Invalid character category symbol `%s'" symbol)))
  1468. ;;; Char sets
  1469. ;; <rxt-char-set> ::= <rxt-char-set-union>
  1470. ;; | <rxt-char-set-negation>
  1471. ;; | <rxt-char-set-intersection>
  1472. (cl-defstruct (rxt-char-set (:include rxt-syntax-tree)))
  1473. ;; An rxt-char-set-union represents the union of any number of
  1474. ;; characters, character ranges, and POSIX character classes: anything
  1475. ;; that can be represented in string notation as a class [ ... ]
  1476. ;; without the negation operator.
  1477. (cl-defstruct (rxt-char-set-union
  1478. (:include rxt-char-set))
  1479. chars ; list of single characters
  1480. ranges ; list of ranges (from . to)
  1481. classes ; list of character classes
  1482. (case-fold rxt-pcre-case-fold))
  1483. ;; Test for empty character set
  1484. (defun rxt-empty-char-set-p (cset)
  1485. (and (rxt-char-set-union-p cset)
  1486. (null (rxt-char-set-union-chars cset))
  1487. (null (rxt-char-set-union-ranges cset))
  1488. (null (rxt-char-set-union-classes cset))))
  1489. ;; Simple union constructor
  1490. (defun rxt-char-set-union (&rest items)
  1491. "Construct an regexp character set representing the union of ITEMS.
  1492. Each element of ITEMS may be either: a character; a
  1493. single-character string; a single-character `rxt-string' object;
  1494. a cons, (FROM . TO) representing a range of characters; a symbol,
  1495. representing a named character class; or an `rxt-char-set-union'
  1496. object. All `rxt-char-set-union' objects in ITEMS must have the
  1497. same `case-fold' property."
  1498. (let ((chars '())
  1499. (ranges '())
  1500. (classes '())
  1501. (case-fold 'undetermined))
  1502. (dolist (item items)
  1503. (cl-etypecase item
  1504. (character
  1505. (push item chars))
  1506. (string
  1507. (cl-assert (= 1 (length item)))
  1508. (push (string-to-char item) chars))
  1509. (rxt-string
  1510. (cl-assert (= 1 (length (rxt-string-chars item))))
  1511. (push (string-to-char (rxt-string-chars item)) chars))
  1512. (cons ; range (from . to)
  1513. (cl-check-type (car item) character)
  1514. (cl-check-type (cdr item) character)
  1515. (push item ranges))
  1516. (symbol ; named character class
  1517. (push item classes))
  1518. (rxt-char-set-union
  1519. (if (eq case-fold 'undetermined)
  1520. (setq case-fold (rxt-char-set-union-case-fold item))
  1521. (unless (eq case-fold (rxt-char-set-union-case-fold item))
  1522. (error "Cannot construct union of char-sets with unlike case-fold setting: %S" item)))
  1523. (setq chars (nconc chars (rxt-char-set-union-chars item)))
  1524. (setq ranges (nconc ranges (rxt-char-set-union-ranges item)))
  1525. (setq classes (nconc classes (rxt-char-set-union-classes item))))))
  1526. (make-rxt-char-set-union :chars chars :ranges ranges :classes classes
  1527. :case-fold (if (eq case-fold 'undetermined)
  1528. rxt-pcre-case-fold
  1529. case-fold))))
  1530. (defun rxt--all-char-set-union-chars (char-set)
  1531. "Return a list of all characters in CHAR-SET."
  1532. (cl-assert (rxt-char-set-union-p char-set))
  1533. (append
  1534. (rxt-char-set-union-chars char-set)
  1535. (cl-loop for (start . end) in (rxt-char-set-union-ranges char-set)
  1536. nconc (cl-loop for char from start to end collect char))))
  1537. (defun rxt--simplify-char-set (char-set &optional case-fold-p)
  1538. "Return a minimal char-set to match the same characters as CHAR-SET.
  1539. With optional argument CASE-FOLD-P, return a char-set which
  1540. emulates case-folding behaviour by including both uppercase and
  1541. lowercase versions of all characters in CHAR-SET."
  1542. (cl-assert (rxt-char-set-union-p char-set))
  1543. (let* ((classes (rxt-char-set-union-classes char-set))
  1544. (all-chars
  1545. (if case-fold-p
  1546. (cl-loop for char in (rxt--all-char-set-union-chars char-set)
  1547. nconc (list (upcase char) (downcase char)))
  1548. (rxt--all-char-set-union-chars char-set)))
  1549. (all-ranges
  1550. (rxt--extract-ranges (rxt--remove-redundant-chars all-chars classes))))
  1551. (let ((singletons nil)
  1552. (ranges nil))
  1553. (cl-loop for (start . end) in all-ranges
  1554. do
  1555. (cond ((= start end) (push start singletons))
  1556. ((= (1+ start) end)
  1557. (push start singletons)
  1558. (push end singletons))
  1559. (t (push (cons start end) ranges))))
  1560. (make-rxt-char-set-union :chars (nreverse singletons)
  1561. :ranges (nreverse ranges)
  1562. :classes classes
  1563. :case-fold (if case-fold-p
  1564. nil
  1565. (rxt-char-set-union-case-fold char-set))))))
  1566. (defun rxt--remove-redundant-chars (chars classes)
  1567. "Remove all characters which match a character class in CLASSES from CHARS."
  1568. (if (null classes)
  1569. chars
  1570. (string-to-list
  1571. (replace-regexp-in-string
  1572. (rx-to-string `(any ,@classes))
  1573. ""
  1574. (apply #'string chars)))))
  1575. (defun rxt--extract-ranges (chars)
  1576. "Return a list of all contiguous ranges in CHARS.
  1577. CHARS should be a list of characters (integers). The return
  1578. value is a list of conses (START . END) representing ranges, such
  1579. that the union of all the ranges represents the same of
  1580. characters as CHARS.
  1581. Example:
  1582. (rxt--extract-ranges (list ?a ?b ?c ?q ?x ?y ?z))
  1583. => ((?a . ?c) (?q . ?q) (?x . ?z))"
  1584. (let ((array
  1585. (apply #'vector
  1586. (cl-remove-duplicates
  1587. (sort (copy-sequence chars) #'<)))))
  1588. (cl-labels
  1589. ((recur (start end)
  1590. (if (< end start)
  1591. nil
  1592. (let ((min (aref array start))
  1593. (max (aref array end)))
  1594. (if (= (- max min) (- end start))
  1595. (list (cons min max))
  1596. (let* ((split-point (/ (+ start end) 2))
  1597. (left (recur start split-point))
  1598. (right (recur (1+ split-point) end)))
  1599. (merge left right))))))
  1600. (merge (left right)
  1601. (cond ((null left) right)
  1602. ((null right) left)
  1603. (t
  1604. (let ((last-left (car (last left)))
  1605. (first-right (car right)))
  1606. (if (= (1+ (cdr last-left))
  1607. (car first-right))
  1608. (append (cl-subseq left 0 -1)
  1609. (list
  1610. (cons (car last-left)
  1611. (cdr first-right)))
  1612. (cl-subseq right 1))
  1613. (append left right)))))))
  1614. (recur 0 (1- (length array))))))
  1615. ;;; Set complement of character set, syntax class, or character
  1616. ;;; category
  1617. ;; In general, all character sets that can be represented in string
  1618. ;; notation as [^ ... ] (but see `rxt-char-set-intersection', below), plus
  1619. ;; Emacs' \Sx and \Cx constructions.
  1620. (cl-defstruct (rxt-char-set-negation
  1621. (:include rxt-char-set))
  1622. elt)
  1623. (defun rxt-negate (char-set)
  1624. "Construct the logical complement (negation) of CHAR-SET.
  1625. CHAR-SET may be any of the following types: `rxt-char-set-union',
  1626. `rxt-syntax-class', `rxt-char-category', or `rxt-char-set-negation'."
  1627. (cl-etypecase char-set
  1628. ((or rxt-char-set-union rxt-syntax-class rxt-char-category)
  1629. (make-rxt-char-set-negation :elt char-set))
  1630. (rxt-char-set-negation
  1631. (rxt-char-set-negation-elt char-set))))
  1632. ;;; Intersections of char sets
  1633. ;; These are difficult to represent in general, but can be constructed
  1634. ;; in Perl using double negation; for example: [^\Wabc] means the set
  1635. ;; complement of [abc] with respect to the universe of "word
  1636. ;; characters": (& (~ (~ word)) (~ ("abc"))) == (& word (~ ("abc")))
  1637. ;; == (- word ("abc"))
  1638. (cl-defstruct (rxt-char-set-intersection
  1639. (:include rxt-char-set))
  1640. elts)
  1641. ;; Intersection constructor
  1642. (defun rxt-char-set-intersection (&rest charsets)
  1643. (let ((elts '())
  1644. (cmpl (make-rxt-char-set-union)))
  1645. (dolist (cset (rxt-int-flatten charsets))
  1646. (cond
  1647. ((rxt-char-set-negation-p cset)
  1648. ;; Fold negated charsets together: ~A & ~B = ~(A|B)
  1649. (setq cmpl (rxt-char-set-union cmpl (rxt-char-set-negation-elt cset))))
  1650. ((rxt-char-set-union-p cset)
  1651. (push cset elts))
  1652. (t
  1653. (rxt-error "Can't take intersection of non-character-set %S" cset))))
  1654. (if (null elts)
  1655. (rxt-negate cmpl)
  1656. (unless (rxt-empty-char-set-p cmpl)
  1657. (push (rxt-negate cmpl) elts))
  1658. (if (null (cdr elts))
  1659. (car elts) ; singleton case
  1660. (make-rxt-char-set-intersection :elts elts)))))
  1661. ;; Constructor helper: flatten nested intersections
  1662. (defun rxt-int-flatten (csets)
  1663. (if (consp csets)
  1664. (let ((cset (car csets))
  1665. (tail (rxt-int-flatten (cdr csets))))
  1666. (if (rxt-char-set-intersection-p cset)
  1667. (append (rxt-int-flatten (rxt-char-set-intersection-elts cset)) tail)
  1668. (cons cset tail)))
  1669. '()))
  1670. ;;;; Macros for building the parser
  1671. (defmacro rxt-token-case (&rest cases)
  1672. "Consume a token at point and evaluate corresponding forms.
  1673. CASES is a list of `cond'-like clauses, (REGEXP BODY ...) where
  1674. the REGEXPs define possible tokens which may appear at point. The
  1675. CASES are considered in order. For each case, if the text at
  1676. point matches REGEXP, then point is moved to the end of the
  1677. matched token, the corresponding BODY is evaluated and their
  1678. value returned. The matched token is available within the BODY
  1679. forms as (match-string 0).
  1680. There can be a default case where REGEXP is `t', which evaluates
  1681. the corresponding FORMS but does not move point.
  1682. Returns `nil' if none of the CASES matches."
  1683. (declare (debug (&rest (sexp &rest form))))
  1684. `(cond
  1685. ,@(cl-loop for (token . action) in cases
  1686. collect
  1687. (if (eq token t)
  1688. `(t ,@action)
  1689. `((looking-at ,token)
  1690. (goto-char (match-end 0))
  1691. ,@action)))))
  1692. (defmacro rxt-with-source-location (&rest body)
  1693. "Evaluate BODY and record source location information on its value.
  1694. BODY may evaluate to any kind of object, but its value should
  1695. generally not be `eq' to any other object."
  1696. (declare (debug (&rest form)))
  1697. (let ((begin (make-symbol "begin"))
  1698. (value (make-symbol "value")))
  1699. `(let ((,begin (point))
  1700. (,value ,(macroexp-progn body)))
  1701. (setf (rxt-location ,value)
  1702. (make-rxt-location :source rxt-source-text-string
  1703. :start (1- ,begin)
  1704. :end (1- (point))))
  1705. ,value)))
  1706. ;; Read PCRE + flags
  1707. (defun rxt-read-delimited-pcre ()
  1708. "Read a Perl-style delimited regexp and flags from the current buffer.
  1709. Point should be before the regexp literal before calling
  1710. this. Currently only regexps delimited by / ... / are supported.
  1711. A preceding \"m\", \"qr\" or \"s\" will be ignored, as will the
  1712. replacement string in an s/.../.../ construction.
  1713. Returns two strings: the regexp and the flags."
  1714. (save-excursion
  1715. (skip-syntax-forward "-")
  1716. ;; Skip m, qr, s
  1717. (let ((is-subst (rxt-token-case
  1718. ("s" t)
  1719. ((rx (or "m" "qr")) nil))))
  1720. (when (not (looking-at "/"))
  1721. (error "Only Perl regexps delimited by slashes are supported"))
  1722. (let ((beg (match-end 0))
  1723. (delim (rx (not (any "\\"))
  1724. (group "/"))))
  1725. (search-forward-regexp delim)
  1726. (let ((end (match-beginning 1)))
  1727. (when is-subst (search-forward-regexp delim))
  1728. (let ((pcre (buffer-substring-no-properties beg end)))
  1729. (rxt-token-case
  1730. ("[gimosx]*"
  1731. (rxt--add-flags pcre (match-string-no-properties 0))))))))))
  1732. ;;;; Elisp and PCRE string notation parser
  1733. ;;; Parser constants
  1734. (defconst rxt-pcre-char-set-alist
  1735. `((?w . ; "word" characters
  1736. (?_ alnum))
  1737. (?d . ; digits
  1738. (digit))
  1739. (?h . ; horizontal whitespace
  1740. (#x0009 #x0020 #x00A0 #x1680 #x180E #x2000 #x2001 #x2002 #x2003
  1741. #x2004 #x2005 #x2006 #x2007 #x2008 #x2009 #x200A #x202F
  1742. #x205F #x3000))
  1743. (?s . ; whitespace
  1744. (9 10 12 13 32))
  1745. (?v . ; vertical whitespace
  1746. (#x000A #x000B #x000C #x000D #x0085 #x2028 #x2029))))
  1747. (defconst rxt-pcre-named-classes-regexp
  1748. (rx "[:"
  1749. (submatch
  1750. (or "alnum" "alpha" "ascii" "blank" "cntrl" "digit" "graph" "lower"
  1751. "print" "punct" "space" "upper" "word" "xdigit"))
  1752. ":]"))
  1753. (defconst rxt-elisp-named-classes-regexp
  1754. (rx "[:"
  1755. (submatch
  1756. (or "alnum" "alpha" "ascii" "blank" "cntrl" "digit" "graph" "lower"
  1757. "print" "punct" "space" "upper" "word" "xdigit"
  1758. "unibyte" "nonascii" "multibyte"))
  1759. ":]"))
  1760. ;;; The following dynamically bound variables control the operation of
  1761. ;;; the parser (see `rxt-parse-re'.)
  1762. (defvar rxt-parse-pcre nil
  1763. "t if the rxt string parser is parsing PCRE syntax, nil for Elisp syntax.
  1764. This should only be let-bound internally, never set otherwise.")
  1765. (defvar rxt-pcre-extended-mode nil
  1766. "t if the rxt string parser is emulating PCRE's \"extended\" mode.
  1767. In extended mode (indicated by /x in Perl/PCRE), whitespace
  1768. outside of character classes and \\Q...\\E quoting is ignored,
  1769. and a `#' character introduces a comment that extends to the end
  1770. of line.")
  1771. (defvar rxt-pcre-s-mode nil
  1772. "t if the rxt string parser is emulating PCRE's single-line \"/s\" mode.
  1773. When /s is used, PCRE's \".\" matches newline characters, which
  1774. otherwise it would not match.")
  1775. (defvar rxt-pcre-case-fold nil
  1776. "non-nil to emulate PCRE's case-insensitive \"/i\" mode in translated regexps.")
  1777. (defvar rxt-branch-end-regexp nil)
  1778. (defvar rxt-choice-regexp nil)
  1779. (defvar rxt-brace-begin-regexp nil)
  1780. (defvar rxt-m-to-n-brace-regexp nil)
  1781. (defvar rxt-m-to-?-brace-regexp nil)
  1782. (defvar rxt-m-brace-regexp nil)
  1783. (defvar rxt-named-classes-regexp nil)
  1784. (defvar rxt-subgroup-count nil)
  1785. (defvar rxt-source-text-string nil)
  1786. (defun rxt-parse-pcre (re)
  1787. (rxt-parse-re re t))
  1788. (defun rxt-parse-elisp (re)
  1789. (rxt-parse-re re nil))
  1790. (defun rxt-parse-re (re pcre-p)
  1791. (let* ((rxt-parse-pcre pcre-p)
  1792. (rxt-pcre-extended-mode nil)
  1793. (rxt-pcre-s-mode nil)
  1794. (rxt-pcre-case-fold nil)
  1795. ;; Bind regexps to match syntax that differs between PCRE and
  1796. ;; Elisp only in the addition of a backslash "\"
  1797. (escape (if pcre-p "" "\\"))
  1798. (rxt-choice-regexp
  1799. (rx-to-string `(seq ,escape "|")))
  1800. (rxt-branch-end-regexp
  1801. (rx-to-string `(or buffer-end
  1802. (seq ,escape (or "|" ")")))))
  1803. (rxt-brace-begin-regexp
  1804. (rx-to-string `(seq ,escape "{")))
  1805. (rxt-m-to-n-brace-regexp
  1806. (rx-to-string
  1807. `(seq
  1808. (submatch (* (any "0-9"))) "," (submatch (+ (any "0-9")))
  1809. ,escape "}")))
  1810. (rxt-m-to-?-brace-regexp
  1811. (rx-to-string
  1812. `(seq (submatch (+ (any "0-9"))) "," ,escape "}")))
  1813. (rxt-m-brace-regexp
  1814. (rx-to-string
  1815. `(seq (submatch (+ (any "0-9"))) ,escape "}")))
  1816. ;; Named character classes [: ... :] differ slightly
  1817. (rxt-named-classes-regexp
  1818. (if pcre-p
  1819. rxt-pcre-named-classes-regexp
  1820. rxt-elisp-named-classes-regexp))
  1821. (rxt-subgroup-count 0)
  1822. (case-fold-search nil))
  1823. (with-temp-buffer
  1824. (insert re)
  1825. (goto-char (point-min))
  1826. (let ((rxt-source-text-string re))
  1827. (rxt-parse-exp)))))
  1828. ;; Parse a complete regex: a number of branches separated by | or
  1829. ;; \|, as determined by `rxt-branch-end-regexp'.
  1830. (defun rxt-parse-exp ()
  1831. ;; These variables are let-bound here because in PCRE mode they may
  1832. ;; be set internally by (?x) or (?s) constructions, whose scope
  1833. ;; lasts until the end of a sub-expression
  1834. (rxt-with-source-location
  1835. (let ((rxt-pcre-extended-mode rxt-pcre-extended-mode)
  1836. (rxt-pcre-s-mode rxt-pcre-s-mode)
  1837. (rxt-pcre-case-fold rxt-pcre-case-fold))
  1838. (if (eobp)
  1839. (rxt-seq)
  1840. (let ((branches '()))
  1841. (cl-block nil
  1842. (while t
  1843. (let ((branch (rxt-parse-branch)))
  1844. (push branch branches)
  1845. (rxt-token-case
  1846. (rxt-choice-regexp nil)
  1847. (t (cl-return (apply #'rxt-choice (reverse branches)))))))))))))
  1848. ;; Skip over whitespace and comments in PCRE extended mode
  1849. (defun rxt-extended-skip ()
  1850. (when rxt-pcre-extended-mode
  1851. (skip-syntax-forward "-")
  1852. (while (looking-at "#")
  1853. (beginning-of-line 2)
  1854. (skip-syntax-forward "-"))))
  1855. ;; Parse a regexp "branch": a sequence of pieces
  1856. (defun rxt-parse-branch ()
  1857. (rxt-extended-skip)
  1858. (rxt-with-source-location
  1859. (let ((pieces '())
  1860. (branch-start-p t))
  1861. (while (not (looking-at rxt-branch-end-regexp))
  1862. (push (rxt-parse-piece branch-start-p) pieces)
  1863. (setq branch-start-p nil))
  1864. (apply #'rxt-seq (reverse pieces)))))
  1865. ;; Parse a regexp "piece": an atom (`rxt-parse-atom') plus any
  1866. ;; following quantifiers
  1867. (defun rxt-parse-piece (&optional branch-begin)
  1868. (rxt-extended-skip)
  1869. (rxt-with-source-location
  1870. (let ((atom (rxt-parse-atom branch-begin)))
  1871. (rxt-parse-quantifiers atom))))
  1872. ;; Parse any and all quantifiers after ATOM and return the quantified
  1873. ;; regexp, or ATOM unchanged if no quantifiers
  1874. (defun rxt-parse-quantifiers (atom)
  1875. (catch 'done
  1876. (while (not (eobp))
  1877. (let ((atom1 (rxt-parse-quantifier atom)))
  1878. (if (eq atom1 atom)
  1879. (throw 'done t)
  1880. (setq atom atom1)))))
  1881. atom)
  1882. ;; Possibly parse a single quantifier after ATOM and return the
  1883. ;; quantified atom, or ATOM if no quantifier
  1884. (defun rxt-parse-quantifier (atom)
  1885. (rxt-extended-skip)
  1886. (rxt-token-case
  1887. ((rx "*?") (rxt-repeat 0 nil atom nil))
  1888. ((rx "*") (rxt-repeat 0 nil atom t))
  1889. ((rx "+?") (rxt-repeat 1 nil atom nil))
  1890. ((rx "+") (rxt-repeat 1 nil atom t))
  1891. ((rx "??") (rxt-repeat 0 1 atom nil))
  1892. ((rx "?") (rxt-repeat 0 1 atom t))
  1893. ;; Brace expression "{M,N}", "{M,}", "{M}"
  1894. (rxt-brace-begin-regexp
  1895. (cl-destructuring-bind (from to)
  1896. (rxt-parse-braces)
  1897. (rxt-repeat from to atom)))
  1898. ;; No quantifiers found
  1899. (t atom)))
  1900. ;; Parse a regexp atom, i.e. an element that binds to any following
  1901. ;; quantifiers. This includes characters, character classes,
  1902. ;; parenthesized groups, assertions, etc.
  1903. (defun rxt-parse-atom (&optional branch-begin)
  1904. (if (eobp)
  1905. (rxt-error "Unexpected end of regular expression")
  1906. (if rxt-parse-pcre
  1907. (rxt-parse-atom/pcre)
  1908. (rxt-parse-atom/el branch-begin))))
  1909. (defun rxt-parse-atom/common ()
  1910. (rxt-token-case
  1911. ((rx "[") (rxt-parse-char-class))
  1912. ((rx "\\b") (rxt-word-boundary))
  1913. ((rx "\\B") (rxt-not-word-boundary))))
  1914. (defun rxt-parse-atom/el (branch-begin)
  1915. (rxt-with-source-location
  1916. (or (rxt-parse-atom/common)
  1917. (rxt-token-case
  1918. ;; "." wildcard
  1919. ((rx ".") (rxt-nonl))
  1920. ;; "^" and "$" are metacharacters only at beginning or end of a
  1921. ;; branch in Elisp; elsewhere they are literals
  1922. ((rx "^")
  1923. (if branch-begin
  1924. (rxt-bol)
  1925. (rxt-string "^")))
  1926. ((rx "$")
  1927. (if (looking-at rxt-branch-end-regexp)
  1928. (rxt-eol)
  1929. (rxt-string "$")))
  1930. ;; Beginning & end of string, word, symbol
  1931. ((rx "\\`") (rxt-bos))
  1932. ((rx "\\'") (rxt-eos))
  1933. ((rx "\\<") (rxt-bow))
  1934. ((rx "\\>") (rxt-eow))
  1935. ((rx "\\_<") (rxt-symbol-start))
  1936. ((rx "\\_>") (rxt-symbol-end))
  1937. ;; Subgroup
  1938. ((rx "\\(") (rxt-parse-subgroup/el))
  1939. ;; Word/non-word characters (meaning depending on syntax table)
  1940. ((rx "\\w") (rxt-wordchar))
  1941. ((rx "\\W") (rxt-not-wordchar))
  1942. ;; Other syntax categories
  1943. ((rx "\\" (submatch (any ?S ?s)) (submatch nonl))
  1944. (let ((negated (string= (match-string 1) "S"))
  1945. (syntax
  1946. (car (rassoc (string-to-char (match-string 2))
  1947. rx-syntax))))
  1948. (if syntax
  1949. (let ((re (rxt-syntax-class syntax)))
  1950. (if negated (rxt-negate re) re))
  1951. (rxt-error "Invalid syntax class `\\%s'" (match-string 0)))))
  1952. ;; Character categories
  1953. ((rx "\\" (submatch (any ?C ?c)) (submatch nonl))
  1954. (let ((negated (string= (match-string 1) "C"))
  1955. (category
  1956. (car (rassoc (string-to-char (match-string 2))
  1957. rx-categories))))
  1958. (if category
  1959. (let ((re (rxt-char-category category)))
  1960. (if negated (rxt-negate re) re))
  1961. (rxt-error "Invalid character category `%s'" (match-string 0)))))
  1962. ;; Backreference
  1963. ((rx (seq "\\" (submatch (any "1-9"))))
  1964. (rxt-backref (string-to-number (match-string 1))))
  1965. ;; Other escaped characters
  1966. ((rx (seq "\\" (submatch nonl)))
  1967. (rxt-string (match-string 1)))
  1968. ;; Normal characters
  1969. ((rx (or "\n" nonl))
  1970. (rxt-string (match-string 0)))))))
  1971. (defun rxt-parse-atom/pcre ()
  1972. (rxt-extended-skip)
  1973. (rxt-with-source-location
  1974. (or
  1975. ;; Is it an atom that's the same in Elisp?
  1976. (rxt-parse-atom/common)
  1977. ;; Is it common to PCRE regex and character class syntax?
  1978. (let ((char (rxt-parse-escapes/pcre)))
  1979. (and char
  1980. (rxt-string (char-to-string char))))
  1981. ;; Otherwise:
  1982. (rxt-token-case
  1983. ;; "." wildcard
  1984. ((rx ".")
  1985. (if rxt-pcre-s-mode
  1986. (rxt-anything)
  1987. (rxt-nonl)))
  1988. ;; Beginning & end of string/line
  1989. ((rx "^") (rxt-bol))
  1990. ((rx "$") (rxt-eol))
  1991. ((rx "\\A") (rxt-bos))
  1992. ((rx "\\Z") (rxt-eos))
  1993. ;; Subgroup
  1994. ((rx "(") (rxt-parse-subgroup/pcre))
  1995. ;; Metacharacter quoting
  1996. ((rx "\\Q")
  1997. ;; It would seem simple to take all the characters between \Q
  1998. ;; and \E and make an rxt-string, but \Q...\E isn't an atom:
  1999. ;; any quantifiers afterward should bind only to the last
  2000. ;; character, not the whole string.
  2001. (let ((begin (point)))
  2002. (search-forward "\\E" nil t)
  2003. (let* ((end (match-beginning 0))
  2004. (str (buffer-substring-no-properties begin (1- end)))
  2005. (char (char-to-string (char-before end))))
  2006. (rxt-seq (rxt-string str)
  2007. (rxt-parse-quantifiers (rxt-string char))))))
  2008. ;; Pre-defined character sets
  2009. ((rx "\\" (submatch (any "d" "D" "h" "H" "s" "S" "v" "V" "w" "W")))
  2010. (rxt--pcre-char-set (string-to-char (match-string 1))))
  2011. ;; \ + digits: backreference or octal char?
  2012. ((rx "\\" (submatch (+ (any "0-9"))))
  2013. (let* ((digits (match-string 1))
  2014. (dec (string-to-number digits)))
  2015. ;; from "man pcrepattern": If the number is less than 10, or if
  2016. ;; there have been at least that many previous capturing left
  2017. ;; parentheses in the expression, the entire sequence is taken
  2018. ;; as a back reference.
  2019. (if (and (> dec 0)
  2020. (or (< dec 10)
  2021. (>= rxt-subgroup-count dec)))
  2022. (progn
  2023. (when rxt-pcre-case-fold
  2024. (display-warning
  2025. 'rxt "Backreferences with case-folding are handled poorly"))
  2026. (rxt-backref dec))
  2027. ;; from "man pcrepattern": if the decimal number is greater
  2028. ;; than 9 and there have not been that many capturing
  2029. ;; subpatterns, PCRE re-reads up to three octal digits
  2030. ;; following the backslash, and uses them to generate a data
  2031. ;; character. Any subsequent digits stand for themselves.
  2032. (goto-char (match-beginning 1))
  2033. (re-search-forward (rx (** 0 3 (any "0-7"))))
  2034. (rxt-string (char-to-string (string-to-number (match-string 0) 8))))))
  2035. ;; Other escaped characters
  2036. ((rx "\\" (submatch nonl)) (rxt-string (match-string 1)))
  2037. ;; Everything else
  2038. ((rx (or (any "\n") nonl)) (rxt-string (match-string 0)))))))
  2039. (defun rxt-parse-escapes/pcre ()
  2040. "Consume a one-char PCRE escape at point and return its codepoint equivalent.
  2041. Handles only those character escapes which have the same meaning
  2042. in character classes as outside them."
  2043. (rxt-token-case
  2044. ((rx "\\a") #x07) ; bell
  2045. ((rx "\\e") #x1b) ; escape
  2046. ((rx "\\f") #x0c) ; formfeed
  2047. ((rx "\\n") #x0a) ; linefeed
  2048. ((rx "\\r") #x0d) ; carriage return
  2049. ((rx "\\t") #x09) ; tab
  2050. ;; Control character
  2051. ((rx "\\c" (submatch nonl))
  2052. ;; from `man pcrepattern':
  2053. ;; The precise effect of \cx is as follows: if x is a lower case
  2054. ;; letter, it is converted to upper case. Then bit 6 of the
  2055. ;; character (hex 40) is inverted.
  2056. (logxor (string-to-char (upcase (match-string 1))) #x40))
  2057. ;; Hex escapes
  2058. ((rx "\\x" (submatch (** 1 2 (any "0-9" "A-Z" "a-z"))))
  2059. (string-to-number (match-string 1) 16))
  2060. ((rx "\\x{" (submatch (* (any "0-9" "A-Z" "a-z"))) "}")
  2061. (string-to-number (match-string 1) 16))))
  2062. (defun rxt-parse-subgroup/pcre ()
  2063. (catch 'return
  2064. (let ((shy nil)
  2065. (extended-mode rxt-pcre-extended-mode)
  2066. (single-line-mode rxt-pcre-s-mode)
  2067. (case-fold rxt-pcre-case-fold))
  2068. (rxt-extended-skip)
  2069. ;; Check for special (? ..) and (* ...) syntax
  2070. (rxt-token-case
  2071. ((rx "?") ; (?
  2072. (rxt-token-case
  2073. ((rx ")") ; Empty group (?)
  2074. (throw 'return (rxt-empty-string)))
  2075. (":" (setq shy t)) ; Shy group (?:
  2076. ("#" ; Comment (?#
  2077. (search-forward ")")
  2078. (throw 'return (rxt-empty-string)))
  2079. ((rx (or ; Flags (?isx-isx
  2080. (seq (group (* (any "gimosx"))) "-" (group (+ (any "gimosx"))))
  2081. (seq (group (+ (any "gimosx"))))))
  2082. (let ((token (match-string 0))
  2083. (on (or (match-string 1) (match-string 3)))
  2084. (off (or (match-string 2) "")))
  2085. (if (cl-find ?x on) (setq extended-mode t))
  2086. (if (cl-find ?s on) (setq single-line-mode t))
  2087. (if (cl-find ?i on) (setq case-fold t))
  2088. (if (cl-find ?x off) (setq extended-mode nil))
  2089. (if (cl-find ?s off) (setq single-line-mode nil))
  2090. (if (cl-find ?i off) (setq case-fold nil))
  2091. (when (string-match-p "[gmo]" token)
  2092. (display-warning
  2093. 'rxt (format "Unhandled PCRE flags in (?%s" token))))
  2094. (rxt-token-case
  2095. (":" (setq shy t)) ; Shy group with flags (?isx-isx: ...
  2096. (")" ; Set flags (?isx-isx)
  2097. ;; Set flags for the remainder of the current subexpression
  2098. (setq rxt-pcre-extended-mode extended-mode
  2099. rxt-pcre-s-mode single-line-mode
  2100. rxt-pcre-case-fold case-fold)
  2101. (throw 'return (rxt-empty-string)))))
  2102. ;; Other constructions like (?=, (?!, etc. are not recognised
  2103. (t (rxt-error "Unrecognized PCRE extended construction `(?%c'"
  2104. (char-after)))))
  2105. ;; No special (* ...) verbs are recognised
  2106. ((rx "*")
  2107. (let ((begin (point)))
  2108. (search-forward ")" nil 'go-to-end)
  2109. (rxt-error "Unrecognized PCRE extended construction `(*%s'"
  2110. (buffer-substring begin (point))))))
  2111. ;; Parse the remainder of the subgroup
  2112. (unless shy (cl-incf rxt-subgroup-count))
  2113. (let* ((rxt-pcre-extended-mode extended-mode)
  2114. (rxt-pcre-s-mode single-line-mode)
  2115. (rxt-pcre-case-fold case-fold)
  2116. (rx (rxt-parse-exp)))
  2117. (rxt-extended-skip)
  2118. (rxt-token-case
  2119. (")" (if shy rx (rxt-submatch rx)))
  2120. (t (rxt-error "Subexpression missing close paren")))))))
  2121. (defun rxt-parse-subgroup/el ()
  2122. (let ((kind
  2123. (rxt-token-case
  2124. ((rx "?:")
  2125. (cl-incf rxt-subgroup-count)
  2126. 'shy)
  2127. ((rx "?" (group (+ (in "0-9"))) ":")
  2128. (let ((n (string-to-number (match-string 1))))
  2129. (when (< rxt-subgroup-count n)
  2130. (setf rxt-subgroup-count n))
  2131. n))
  2132. ((rx "?") ; Reserved
  2133. (rxt-error "Unknown match group sequence")))))
  2134. (let ((rx (rxt-parse-exp)))
  2135. (rxt-token-case
  2136. ((rx "\\)")
  2137. (cond ((eq kind 'shy) rx)
  2138. ((numberp kind)
  2139. (rxt-submatch-numbered kind rx))
  2140. (t (rxt-submatch rx))))
  2141. (t (rxt-error "Subexpression missing close paren"))))))
  2142. (defun rxt-parse-braces ()
  2143. (rxt-token-case
  2144. (rxt-m-to-n-brace-regexp
  2145. (list (string-to-number (match-string 1))
  2146. (string-to-number (match-string 2))))
  2147. (rxt-m-to-?-brace-regexp
  2148. (list (string-to-number (match-string 1)) nil))
  2149. (rxt-m-brace-regexp
  2150. (let ((a (string-to-number (match-string 1))))
  2151. (list a a)))
  2152. (t
  2153. (let ((begin (point)))
  2154. (search-forward "}" nil 'go-to-end)
  2155. (rxt-error "Bad brace expression {%s"
  2156. (buffer-substring-no-properties begin (point)))))))
  2157. ;; Parse a character set range [...]
  2158. (defun rxt-parse-char-class ()
  2159. (when (eobp)
  2160. (rxt-error "Missing close right bracket in regexp"))
  2161. (rxt-with-source-location
  2162. (let* ((negated (rxt-token-case
  2163. ((rx "^") t)
  2164. (t nil)))
  2165. (begin (point))
  2166. (result
  2167. (if negated
  2168. (rxt-negate (rxt-char-set-union))
  2169. (rxt-char-set-union)))
  2170. (transformer
  2171. (if negated #'rxt-negate #'identity))
  2172. (builder
  2173. (if negated #'rxt-char-set-intersection #'rxt-choice)))
  2174. (catch 'done
  2175. (while t
  2176. (when (eobp)
  2177. (rxt-error "Missing close right bracket in regexp"))
  2178. (if (and (looking-at (rx "]"))
  2179. (not (= (point) begin)))
  2180. (throw 'done result)
  2181. (let ((piece (funcall transformer (rxt-parse-char-class-piece))))
  2182. (setq result (funcall builder result piece))))))
  2183. (forward-char) ; Skip over closing "]"
  2184. result)))
  2185. ;; Parse a single character, a character range, or a posix class
  2186. ;; within a character set context. Returns an `rxt-char-set'.
  2187. (defun rxt-parse-char-class-piece ()
  2188. (let ((atom (rxt-parse-char-class-atom)))
  2189. (cl-typecase atom
  2190. (rxt-char-set ; return unchanged
  2191. atom)
  2192. (integer ; character: check for range
  2193. (let ((range-end (rxt-maybe-parse-range-end)))
  2194. (if range-end
  2195. (rxt-char-set-union (cons atom range-end))
  2196. (rxt-char-set-union atom))))
  2197. (t ; transform into character set
  2198. (rxt-char-set-union atom)))))
  2199. ;; Parse a single character or named class within a charset.
  2200. ;;
  2201. ;; Returns an integer (a character), a symbol (representing a named
  2202. ;; character class) or an `rxt-char-set' (for pre-defined character
  2203. ;; classes like \d, \W, etc.)
  2204. (defun rxt-parse-char-class-atom ()
  2205. (or
  2206. ;; First, check for PCRE-specific backslash sequences
  2207. (and rxt-parse-pcre
  2208. (rxt-parse-char-class-atom/pcre))
  2209. ;; Char-class syntax
  2210. (rxt-token-case
  2211. ;; Named classes [:alnum:], ...
  2212. (rxt-named-classes-regexp
  2213. (intern (match-string 1)))
  2214. ;; Error on unknown posix-class-like syntax
  2215. ((rx "[:" (* (any "a-z")) ":]")
  2216. (rxt-error "Unknown posix character class `%s'" (match-string 0)))
  2217. ;; Error on [= ... ]= collation syntax
  2218. ((rx "[" (submatch (any "." "="))
  2219. (* (any "a-z")) (backref 1) "]")
  2220. (rxt-error "Unsupported collation syntax `%s'" (match-string 0)))
  2221. ;; Other characters stand for themselves
  2222. ((rx (or "\n" nonl))
  2223. (string-to-char (match-string 0))))))
  2224. ;; Parse backslash escapes inside PCRE character classes
  2225. (defun rxt-parse-char-class-atom/pcre ()
  2226. (or (rxt-parse-escapes/pcre)
  2227. (rxt-token-case
  2228. ;; Backslash + digits => octal char
  2229. ((rx "\\" (submatch (** 1 3 (any "0-7"))))
  2230. (string-to-number (match-string 1) 8))
  2231. ;; Pre-defined character sets
  2232. ((rx "\\" (submatch (any "d" "D" "h" "H" "s" "S" "v" "V" "w" "W")))
  2233. (rxt--pcre-char-set (string-to-char (match-string 1))))
  2234. ;; "\b" inside character classes is a backspace
  2235. ((rx "\\b") ?\C-h)
  2236. ;; Ignore other escapes
  2237. ((rx "\\" (submatch nonl))
  2238. (string-to-char (match-string 1))))))
  2239. ;; Look for a range tail (the "-z" in "a-z") after parsing a single
  2240. ;; character within in a character set. Returns either a character
  2241. ;; representing the range end, or nil.
  2242. (defun rxt-maybe-parse-range-end ()
  2243. (let ((range-end nil) (end-position nil))
  2244. (when (looking-at (rx "-" (not (any "]"))))
  2245. (save-excursion
  2246. (forward-char)
  2247. (setq range-end (rxt-parse-char-class-atom)
  2248. end-position (point))))
  2249. (if (characterp range-end)
  2250. ;; This is a range: move point after it and return the ending character
  2251. (progn
  2252. (goto-char end-position)
  2253. range-end)
  2254. ;; Not a range.
  2255. nil)))
  2256. ;; Return the pre-defined PCRE char-set associated with CHAR: i.e. \d
  2257. ;; is digits, \D non-digits, \s space characters, etc.
  2258. (defun rxt--pcre-char-set (char)
  2259. (let* ((base-char (downcase char))
  2260. (negated (/= char base-char))
  2261. (elements (assoc-default base-char rxt-pcre-char-set-alist))
  2262. (base-char-set (apply #'rxt-char-set-union elements)))
  2263. (if negated
  2264. (rxt-negate base-char-set)
  2265. base-char-set)))
  2266. ;;;; Unparser to `rx' syntax
  2267. (defconst rxt-rx-verbose-equivalents
  2268. '((bol . line-start)
  2269. (eol . line-end)
  2270. (nonl . not-newline)
  2271. (bos . string-start)
  2272. (eos . string-end)
  2273. (bow . word-start)
  2274. (eow . word-end)
  2275. (seq . sequence))
  2276. "Alist of verbose equivalents for short `rx' primitives.")
  2277. (defun rxt-rx-symbol (sym)
  2278. (if rxt-verbose-rx-translation
  2279. (or (assoc-default sym rxt-rx-verbose-equivalents)
  2280. sym)
  2281. sym))
  2282. (defun rxt-adt->rx (re)
  2283. (let ((rx
  2284. (cl-typecase re
  2285. (rxt-primitive
  2286. (rxt-rx-symbol (rxt-primitive-rx re)))
  2287. (rxt-string
  2288. (if (or (not (rxt-string-case-fold re))
  2289. (string= "" (rxt-string-chars re)))
  2290. (rxt-string-chars re)
  2291. `(seq
  2292. ,@(cl-loop for char across (rxt-string-chars re)
  2293. collect `(any ,(upcase char) ,(downcase char))))))
  2294. (rxt-seq
  2295. `(seq ,@(mapcar #'rxt-adt->rx (rxt-seq-elts re))))
  2296. (rxt-choice
  2297. `(or ,@(mapcar #'rxt-adt->rx (rxt-choice-elts re))))
  2298. (rxt-submatch
  2299. (if (rxt-seq-p (rxt-submatch-body re))
  2300. `(submatch
  2301. ,@(mapcar #'rxt-adt->rx (rxt-seq-elts (rxt-submatch-body re))))
  2302. `(submatch ,(rxt-adt->rx (rxt-submatch-body re)))))
  2303. (rxt-submatch-numbered
  2304. (if (rxt-seq-p (rxt-submatch-numbered-p re))
  2305. `(,(rxt-rx-symbol 'submatch-n)
  2306. ,(rxt-submatch-numbered-n re)
  2307. ,@(mapcar #'rxt-adt->rx
  2308. (rxt-seq-elts
  2309. (rxt-submatch-numbered-body re))))
  2310. `(,(rxt-rx-symbol 'submatch-n)
  2311. ,(rxt-submatch-numbered-n re)
  2312. ,(rxt-adt->rx (rxt-submatch-numbered-body re)))))
  2313. (rxt-backref
  2314. (let ((n (rxt-backref-n re)))
  2315. (if (<= n 9)
  2316. `(backref ,(rxt-backref-n re))
  2317. (rxt-error "Too many backreferences (%s)" n))))
  2318. (rxt-syntax-class
  2319. `(syntax ,(rxt-syntax-class-symbol re)))
  2320. (rxt-char-category
  2321. `(category ,(rxt-char-category-symbol re)))
  2322. (rxt-repeat
  2323. (let ((from (rxt-repeat-from re))
  2324. (to (rxt-repeat-to re))
  2325. (greedy (rxt-repeat-greedy re))
  2326. (body (rxt-adt->rx (rxt-repeat-body re))))
  2327. (if rxt-verbose-rx-translation
  2328. (let ((rx
  2329. (cond
  2330. ((and (zerop from) (null to))
  2331. `(zero-or-more ,body))
  2332. ((and (equal from 1) (null to))
  2333. `(one-or-more ,body))
  2334. ((and (zerop from) (equal to 1))
  2335. `(zero-or-one ,body))
  2336. ((null to)
  2337. `(>= ,from ,body))
  2338. ((equal from to)
  2339. `(repeat ,from ,body))
  2340. (t
  2341. `(** ,from ,to ,body)))))
  2342. (if greedy
  2343. (if rxt-explain
  2344. rx ; Readable but not strictly accurate. Fixme?
  2345. `(maximal-match ,rx))
  2346. `(minimal-match ,rx)))
  2347. (cond
  2348. ((and (zerop from) (null to))
  2349. `(,(if greedy '* '*?) ,body))
  2350. ((and (equal from 1) (null to))
  2351. `(,(if greedy '+ '+?) ,body))
  2352. ((and (zerop from) (equal to 1))
  2353. `(,(if greedy ? ??) ,body))
  2354. ((null to)
  2355. `(>= ,from ,body))
  2356. ((equal from to)
  2357. `(= ,from ,body))
  2358. (t
  2359. `(** ,from ,to ,body))))))
  2360. (rxt-char-set-union
  2361. (let* ((case-fold (rxt-char-set-union-case-fold re))
  2362. (re (rxt--simplify-char-set re case-fold))
  2363. (chars (rxt-char-set-union-chars re))
  2364. (ranges (rxt-char-set-union-ranges re))
  2365. (classes (rxt-char-set-union-classes re))
  2366. (case-fold (rxt-char-set-union-case-fold re)))
  2367. (if (and (null chars) (null ranges) (= 1 (length classes)))
  2368. (car classes)
  2369. `(any ,@chars ,@ranges ,@classes))))
  2370. (rxt-char-set-negation
  2371. `(not ,(rxt-adt->rx (rxt-char-set-negation-elt re))))
  2372. (t
  2373. (rxt-error "No RX translation of `%s'" (rxt-to-string re))))))
  2374. ;; Store source information on each fragment of the generated RX
  2375. ;; sexp for rxt-explain mode
  2376. (when rxt-explain
  2377. ;; Use gensyms to store unique source information for multiple
  2378. ;; occurrences of primitives like `bol'
  2379. (when (symbolp rx)
  2380. (setq rx (make-symbol (symbol-name rx))))
  2381. (setf (rxt-location rx) (rxt-location re)))
  2382. rx))
  2383. ;;;; 'Unparser' to PCRE notation
  2384. ;;; Based on scsh/posixstr.scm in scsh
  2385. ;; To ensure that the operator precedence in the generated regexp does
  2386. ;; what we want, we need to keep track of what kind of production is
  2387. ;; returned from each step. Therefore these functions return a string
  2388. ;; and a numeric "level" which lets the function using the generated
  2389. ;; regexp know whether it has to be parenthesized:
  2390. ;;
  2391. ;; 0: an already parenthesized expression
  2392. ;;
  2393. ;; 1: a "piece" binds to any succeeding quantifiers
  2394. ;;
  2395. ;; 2: a "branch", or concatenation of pieces, needs parenthesizing to
  2396. ;; bind to quantifiers
  2397. ;;
  2398. ;; 3: a "top", or alternation of branches, needs parenthesizing to
  2399. ;; bind to quantifiers or to concatenation
  2400. ;;
  2401. ;; This idea is stolen straight out of the scsh implementation.
  2402. (defun rxt-adt->pcre (re)
  2403. (cl-destructuring-bind (s _) (rxt-adt->pcre/lev re) s))
  2404. (defun rxt-adt->pcre/lev (re)
  2405. (cl-typecase re
  2406. (rxt-primitive
  2407. (let ((s (rxt-primitive-pcre re)))
  2408. (if s
  2409. (list s 1)
  2410. (rxt-error "No PCRE translation of `%s'" (rxt-to-string re)))))
  2411. (rxt-string (rxt-string->pcre re))
  2412. (rxt-seq (rxt-seq->pcre re))
  2413. (rxt-choice (rxt-choice->pcre re))
  2414. (rxt-submatch (rxt-submatch->pcre re))
  2415. (rxt-backref
  2416. (list (format "\\%d" (rxt-backref-n re)) 1))
  2417. (rxt-repeat (rxt-repeat->pcre re))
  2418. ((or rxt-char-set-union rxt-char-set-negation)
  2419. (rxt-char-set->pcre re))
  2420. ;; FIXME
  2421. ;; ((rxt-char-set-intersection re) (rxt-char-set-intersection->pcre re))
  2422. (t
  2423. (rxt-error "No PCRE translation of `%s'" (rxt-to-string re)))))
  2424. (defconst rxt-pcre-metachars (rx (any "\\^.$|()[]*+?{}")))
  2425. (defconst rxt-pcre-charset-metachars (rx (any "]" "[" "\\" "^" "-")))
  2426. (defun rxt-string->pcre (re)
  2427. (let ((chars (rxt-string-chars re)))
  2428. (list
  2429. (replace-regexp-in-string
  2430. rxt-pcre-metachars
  2431. "\\\\\\&" chars)
  2432. ;; A one-character string is a 'piece' (it binds to a following
  2433. ;; quantifier). A longer string is a 'branch' (it has to be
  2434. ;; enclosed in parentheses to bind to a following quantifier).
  2435. (if (> (length chars) 1) 2 1))))
  2436. (defun rxt-seq->pcre (re)
  2437. (let ((elts (rxt-seq-elts re)))
  2438. (if (null elts)
  2439. ""
  2440. (rxt-seq-elts->pcre elts))))
  2441. (defun rxt-seq-elts->pcre (elts)
  2442. (cl-destructuring-bind
  2443. (s lev) (rxt-adt->pcre/lev (car elts))
  2444. (if (null (cdr elts))
  2445. (list s lev)
  2446. (cl-destructuring-bind
  2447. (s1 lev1) (rxt-seq-elts->pcre (cdr elts))
  2448. (list (concat (rxt-paren-if-necessary s lev)
  2449. (rxt-paren-if-necessary s1 lev1))
  2450. 2)))))
  2451. (defun rxt-paren-if-necessary (s lev)
  2452. (if (< lev 3)
  2453. s
  2454. (concat "(?:" s ")")))
  2455. (defun rxt-choice->pcre (re)
  2456. (let ((elts (rxt-choice-elts re)))
  2457. (if (null elts)
  2458. nil
  2459. (rxt-choice-elts->pcre elts))))
  2460. (defun rxt-choice-elts->pcre (elts)
  2461. (cl-destructuring-bind
  2462. (s lev) (rxt-adt->pcre/lev (car elts))
  2463. (if (null (cdr elts))
  2464. (list s lev)
  2465. (cl-destructuring-bind
  2466. (s1 lev1) (rxt-choice-elts->pcre (cdr elts))
  2467. (list (concat s "|" s1) 3)))))
  2468. (defun rxt-submatch->pcre (re)
  2469. (cl-destructuring-bind
  2470. (s lev) (rxt-adt->pcre/lev (rxt-submatch-body re))
  2471. (list (concat "(" s ")") 0)))
  2472. (defun rxt-repeat->pcre (re)
  2473. (let ((from (rxt-repeat-from re))
  2474. (to (rxt-repeat-to re))
  2475. (body (rxt-repeat-body re))
  2476. (greedy (rxt-repeat-greedy re)))
  2477. (cl-destructuring-bind
  2478. (s lev) (rxt-adt->pcre/lev body)
  2479. (cond
  2480. ((and to (= from 1) (= to 1)) (list s lev))
  2481. ((and to (= from 0) (= to 0)) (list "" 2))
  2482. (t
  2483. (when (> lev 1) ; parenthesize non-atoms
  2484. (setq s (concat "(?:" s ")")
  2485. lev 0))
  2486. (list (if to
  2487. (cond ((and (= from 0) (= to 1))
  2488. (concat s (if greedy "?" "??")))
  2489. ((= from to)
  2490. (concat s "{" (number-to-string to) "}"))
  2491. (t
  2492. (concat s "{" (number-to-string from)
  2493. "," (number-to-string to) "}")))
  2494. (cond ((= from 0)
  2495. (concat s (if greedy "*" "*?")))
  2496. ((= from 1)
  2497. (concat s (if greedy "+" "+?")))
  2498. (t (concat s "{" (number-to-string from) ",}"))))
  2499. 1))))))
  2500. (defun rxt-char-set->pcre (re)
  2501. (cond ((rxt-char-set-union-p re)
  2502. (list
  2503. (concat "[" (rxt-char-set->pcre/chars re) "]") 1))
  2504. ((rxt-char-set-negation-p re)
  2505. (let ((elt (rxt-char-set-negation-elt re)))
  2506. (if (rxt-char-set-union-p elt)
  2507. (list
  2508. (concat "[^" (rxt-char-set->pcre/chars elt) "]") 1)
  2509. (rxt-error "No PCRE translation of `%s'" (rxt-to-string elt)))))
  2510. (t
  2511. (rxt-error "Non-char-set in rxt-char-set->pcre: %s" re))))
  2512. ;; Fortunately, easier in PCRE than in POSIX!
  2513. (defun rxt-char-set->pcre/chars (re)
  2514. (cl-flet
  2515. ((escape
  2516. (char)
  2517. (let ((s (char-to-string char)))
  2518. (cond ((string-match rxt-pcre-charset-metachars s)
  2519. (concat "\\" s))
  2520. ((and (not (string= s " "))
  2521. (string-match "[^[:graph:]]" s))
  2522. (format "\\x{%x}" char))
  2523. (t s)))))
  2524. (let ((chars (rxt-char-set-union-chars re))
  2525. (ranges (rxt-char-set-union-ranges re))
  2526. (classes (rxt-char-set-union-classes re)))
  2527. (concat
  2528. (mapconcat #'escape chars "")
  2529. (mapconcat #'(lambda (rg)
  2530. (format "%s-%s"
  2531. (escape (car rg))
  2532. (escape (cdr rg))))
  2533. ranges "")
  2534. (mapconcat #'(lambda (class)
  2535. (format "[:%s:]" class))
  2536. classes "")))))
  2537. ;;;; Generate all productions of a finite regexp
  2538. (defun rxt-adt->strings (re)
  2539. (cl-typecase re
  2540. (rxt-primitive
  2541. (list ""))
  2542. (rxt-string
  2543. (list (rxt-string-chars re)))
  2544. (rxt-seq
  2545. (rxt-seq-elts->strings (rxt-seq-elts re)))
  2546. (rxt-choice
  2547. (rxt-choice-elts->strings (rxt-choice-elts re)))
  2548. (rxt-submatch
  2549. (rxt-adt->strings (rxt-submatch-body re)))
  2550. (rxt-submatch-numbered
  2551. (rxt-adt->strings (rxt-submatch-numbered-body re)))
  2552. (rxt-repeat
  2553. (rxt-repeat->strings re))
  2554. (rxt-char-set-union
  2555. (rxt-char-set->strings re))
  2556. (t
  2557. (error "Can't generate productions of %s"
  2558. (rxt-syntax-tree-readable re)))))
  2559. (defun rxt-concat-product (heads tails)
  2560. (cl-mapcan
  2561. (lambda (hs)
  2562. (mapcar
  2563. (lambda (ts) (concat hs ts))
  2564. tails))
  2565. heads))
  2566. (defun rxt-seq-elts->strings (elts)
  2567. (if (null elts)
  2568. '("")
  2569. (let ((heads (rxt-adt->strings (car elts)))
  2570. (tails (rxt-seq-elts->strings (cdr elts))))
  2571. (rxt-concat-product heads tails))))
  2572. (defun rxt-choice-elts->strings (elts)
  2573. (if (null elts)
  2574. '()
  2575. (append (rxt-adt->strings (car elts))
  2576. (rxt-choice-elts->strings (cdr elts)))))
  2577. (defun rxt-repeat->strings (re)
  2578. (let ((from (rxt-repeat-from re))
  2579. (to (rxt-repeat-to re)))
  2580. (if (not to)
  2581. (error "Can't generate all productions of unbounded repeat \"%s\""
  2582. (rxt-syntax-tree-readable re))
  2583. (let ((strings (rxt-adt->strings (rxt-repeat-body re))))
  2584. (rxt-repeat-n-m->strings from to strings)))))
  2585. (defun rxt-repeat-n-m->strings (from to strings)
  2586. (cond
  2587. ((zerop to) '(""))
  2588. ((= to from) (rxt-repeat-n->strings from strings))
  2589. (t ; to > from
  2590. (let* ((strs-n (rxt-repeat-n->strings from strings))
  2591. (accum (cl-copy-list strs-n)))
  2592. (dotimes (i (- to from))
  2593. (setq strs-n (rxt-concat-product strs-n strings))
  2594. (setq accum (nconc accum strs-n)))
  2595. accum))))
  2596. (defun rxt-repeat-n->strings (n strings)
  2597. ;; n > 1
  2598. (cond ((zerop n) '(""))
  2599. ((= n 1) strings)
  2600. (t
  2601. (rxt-concat-product
  2602. (rxt-repeat-n->strings (- n 1) strings)
  2603. strings))))
  2604. (defun rxt-char-set->strings (re)
  2605. (if (rxt-char-set-union-classes re)
  2606. (error "Can't generate all productions of named character classes in \"%s\""
  2607. (rxt-syntax-tree-readable re))
  2608. (let ((chars (mapcar #'char-to-string (rxt-char-set-union-chars re))))
  2609. (dolist (range (rxt-char-set-union-ranges re))
  2610. (let ((end (cdr range)))
  2611. (cl-do ((i (car range) (+ i 1)))
  2612. ((> i end))
  2613. (push (char-to-string i) chars))))
  2614. chars)))
  2615. ;;;; RE-Builder hacks
  2616. (defadvice reb-update-modestring
  2617. (after rxt () activate compile)
  2618. "This function is hacked for emulated PCRE syntax and regexp conversion."
  2619. (setq reb-mode-string
  2620. (concat
  2621. (format " (%s)" reb-re-syntax)
  2622. reb-mode-string))
  2623. (force-mode-line-update))
  2624. (defadvice reb-change-syntax
  2625. (around rxt (&optional syntax) activate compile)
  2626. "This function is hacked for emulated PCRE syntax and regexp conversion."
  2627. (interactive
  2628. (list (intern
  2629. (completing-read (format "Select syntax (%s): " reb-re-syntax)
  2630. '(read string pcre sregex rx)
  2631. nil t "" nil (symbol-name reb-re-syntax)))))
  2632. (unless (memq syntax '(read string pcre lisp-re sregex rx))
  2633. (error "Invalid syntax: %s" syntax))
  2634. (let ((re-builder-buffer (get-buffer reb-buffer)))
  2635. (setq reb-re-syntax syntax)
  2636. (when re-builder-buffer
  2637. (with-current-buffer reb-target-buffer
  2638. (cl-case syntax
  2639. (rx
  2640. (let ((rx (rxt-elisp-to-rx reb-regexp)))
  2641. (setq reb-regexp-src
  2642. (with-temp-buffer
  2643. (insert "\n" "'")
  2644. (rxt-print rx)
  2645. (buffer-string)))))
  2646. (pcre
  2647. (setq reb-regexp-src (rxt-elisp-to-pcre reb-regexp)))))
  2648. (with-current-buffer re-builder-buffer
  2649. ;; Hack: prevent reb-auto-update from clobbering the
  2650. ;; reb-regexp-src we just set
  2651. (let ((inhibit-modification-hooks t))
  2652. (reb-initialize-buffer))
  2653. ;; Enable flag-toggling bindings for PCRE syntax
  2654. (rxt--re-builder-switch-pcre-mode)))))
  2655. (defadvice reb-read-regexp
  2656. (around rxt () activate compile)
  2657. "This function is hacked for emulated PCRE syntax and regexp conversion."
  2658. (if (eq reb-re-syntax 'pcre)
  2659. (setq ad-return-value
  2660. (save-excursion
  2661. (goto-char (point-min))
  2662. (rxt-read-delimited-pcre)))
  2663. ad-do-it))
  2664. (defadvice reb-insert-regexp
  2665. (around rxt () activate compile)
  2666. "This function is hacked for emulated PCRE syntax and regexp conversion."
  2667. (if (eq reb-re-syntax 'pcre)
  2668. (let ((src (reb-target-binding reb-regexp-src)))
  2669. (if src
  2670. (insert "\n/" (replace-regexp-in-string "/" "\\/" src t t) "/")
  2671. (insert "\n//")))
  2672. ad-do-it))
  2673. (defadvice reb-cook-regexp
  2674. (around rxt (re) activate compile)
  2675. "This function is hacked for emulated PCRE syntax and regexp conversion."
  2676. (if (eq reb-re-syntax 'pcre)
  2677. (setq ad-return-value (rxt-pcre-to-elisp re))
  2678. ad-do-it))
  2679. (defadvice reb-update-regexp
  2680. (around rxt () activate compile)
  2681. "This function is hacked for emulated PCRE syntax and regexp conversion."
  2682. (setq ad-return-value
  2683. (let* ((re-src (reb-read-regexp))
  2684. (re (reb-cook-regexp re-src)))
  2685. (with-current-buffer reb-target-buffer
  2686. (let ((oldre reb-regexp))
  2687. (prog1
  2688. (not (string= oldre re))
  2689. (setq reb-regexp re)
  2690. ;; Update the source re if format requires translation
  2691. (when (or (reb-lisp-syntax-p) (eq reb-re-syntax 'pcre))
  2692. (setq reb-regexp-src re-src))))))))
  2693. (defun rxt--re-builder-switch-pcre-mode ()
  2694. (rxt--read-pcre-mode
  2695. (if (eq reb-re-syntax 'pcre) 1 0)))
  2696. (add-hook 'reb-mode-hook #'rxt--re-builder-switch-pcre-mode)
  2697. (provide 'rxt)
  2698. (provide 'pcre2el)
  2699. ;;; pcre2el.el ends here