Module regex

A library for parsing, compiling, and executing regular expressions. The match time is linear with respect to the length of the input and the regular expression. So, it can handle input from untrusted users. Its syntax is similar to PCRE but lacks a few features that can not be implemented while keeping the space/time complexity guarantees, i.e.: backreferences and look-around assertions.

Syntax

Matching one character

.          any character except new line (includes new line with s flag)
\d         digit (\p{Nd})
\D         not digit
\pN        One-letter name Unicode character class
\p{Greek}  Unicode character class (general category or script)
\PN        Negated one-letter name Unicode character class
\P{Greek}  negated Unicode character class (general category or script)

Character classes

[xyz]         A character class matching either x, y or z (union).
[^xyz]        A character class matching any character except x, y and z.
[a-z]         A character class matching any character in range a-z.
[[:alpha:]]   ASCII character class ([A-Za-z])
[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
[\[\]]        Escaping in character classes (matching [ or ])

Composites

xy   concatenation (x followed by y)
x|y  alternation (x or y, prefer x)

Repetitions

x*       zero or more of x (greedy)
x+       one or more of x (greedy)
x?       zero or one of x (greedy)
x*?      zero or more of x (ungreedy/lazy)
x+?      one or more of x (ungreedy/lazy)
x??      zero or one of x (ungreedy/lazy)
x{n,m}   at least n x and at most m x (greedy)
x{n,}    at least n x (greedy)
x{n}     exactly n x
x{n,m}?  at least n x and at most m x (ungreedy/lazy)
x{n,}?   at least n x (ungreedy/lazy)
x{n}?    exactly n x

Empty matches

^   the beginning of text (or start-of-line with multi-line mode)
$   the end of text (or end-of-line with multi-line mode)
\A  only the beginning of text (even with multi-line mode enabled)
\z  only the end of text (even with multi-line mode enabled)
\b  a Unicode word boundary (\w on one side and \W, \A, or \z on other)
\B  not a Unicode word boundary

Grouping and flags

(exp)          numbered capture group (indexed by opening parenthesis)
(?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
(?:exp)        non-capturing group
(?flags)       set flags within current group
(?flags:exp)   set flags for exp (non-capturing)

Flags are each a single character. For example, (?x) sets the flag x and (?-x) clears the flag x. Multiple flags can be set or cleared at the same time: (?xy) sets both the x and y flags, (?x-y) sets the x flag and clears the y flag, and (?-xy) clears both the x and y flags.

i  case-insensitive: letters match both upper and lower case
m  multi-line mode: ^ and $ match begin/end of line
s  allow . to match \L (new line)
U  swap the meaning of x* and x*? (un-greedy mode)
u  Unicode support (enabled by default)
x  ignore whitespace and allow line comments (starting with `#`)

All flags are disabled by default unless stated otherwise

Escape sequences

\*         literal *, works for any punctuation character: \.+*?()|[]{}^$
\a         bell (\x07)
\f         form feed (\x0C)
\t         horizontal tab
\n         new line (\L)
\r         carriage return
\v         vertical tab (\x0B)
\123       octal character code (up to three digits)
\x7F       hex character code (exactly two digits)
\x{10FFFF} any hex character code corresponding to a Unicode code point
\u007F     hex character code (exactly four digits)
\U0010FFFF hex character code (exactly eight digits)

Perl character classes (Unicode friendly)

These classes are based on the definitions provided in UTS#18

\d  digit (\p{Nd})
\D  not digit
\s  whitespace (\p{White_Space})
\S  not whitespace
\w  word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
\W  not word character

ASCII character classes

[[:alnum:]]   alphanumeric ([0-9A-Za-z])
[[:alpha:]]   alphabetic ([A-Za-z])
[[:ascii:]]   ASCII ([\x00-\x7F])
[[:blank:]]   blank ([\t ])
[[:cntrl:]]   control ([\x00-\x1F\x7F])
[[:digit:]]   digits ([0-9])
[[:graph:]]   graphical ([!-~])
[[:lower:]]   lower case ([a-z])
[[:print:]]   printable ([ -~])
[[:punct:]]   punctuation ([!-/:-@\[-`{-~])
[[:space:]]   whitespace ([\t\n\v\f\r ])
[[:upper:]]   upper case ([A-Z])
[[:word:]]    word characters ([0-9A-Za-z_])
[[:xdigit:]]  hex digit ([0-9A-Fa-f])

Types

RegexError = object of ValueError
Regex = object
  states: seq[Node]
  groupsCount: int16
  namedGroups: Table[string, int16]
a compiled regular expression
RegexMatch = object
  captures: seq[Slice[int]]
  groups: seq[Slice[int]]
  namedGroups: Table[string, int16]
  boundaries*: Slice[int]
result from matching operations

Procs

proc group(m: RegexMatch; i: int): seq[Slice[int]] {...}{.raises: [], tags: [].}
return slices for a given group. Use the iterator version if you care about performance
proc group(m: RegexMatch; s: string): seq[Slice[int]] {...}{.raises: [KeyError], tags: [].}
return slices for a given named group. Use the iterator version if you care about performance
proc groupsCount(m: RegexMatch): int {...}{.raises: [], tags: [].}
return the number of capturing groups
block:
  var m: RegexMatch
  doAssert "ab".match(re"(a)(b)", m)
  doAssert m.groupsCount == 2
block:
  var m: RegexMatch
  doAssert "ab".match(re"((ab))", m)
  doAssert m.groupsCount == 2
proc match(s: string; pattern: Regex; m: var RegexMatch; start = 0): bool {...}{.raises: [],
    tags: [].}
return a match if the whole string matches the regular expression. This is similar to find(text, re"^regex$") but has better performance
var m: RegexMatch
doAssert "abcd".match(re"abcd", m)
doAssert(not "abcd".match(re"abc", m))
proc contains(s: string; pattern: Regex): bool {...}{.raises: [], tags: [].}
search for the pattern anywhere in the string. It returns as soon as there is a match, even when the expression has repetitions. Use re"^regex$" to match the whole string instead of searching
doAssert(re"bc" in "abcd")
doAssert(re"(23)+" in "23232")
doAssert(re"^(23)+$" notin "23232")
proc find(s: string; pattern: Regex; m: var RegexMatch; start = 0): bool {...}{.raises: [],
    tags: [].}
search through the string looking for the first location where there is a match
var m: RegexMatch
doAssert "abcd".find(re"bc", m)
doAssert(not "abcd".find(re"de", m))
doAssert "2222".find(re"(22)*", m) and
  m.group(0) == @[0 .. 1, 2 .. 3]
proc findAll(s: string; pattern: Regex; start = 0): seq[RegexMatch] {...}{.raises: [], tags: [].}
search through the string and return each match. Empty matches (start > end) are included
proc findAndCaptureAll(s: string; pattern: Regex): seq[string] {...}{.raises: [], tags: [].}
search through the string and return a seq with captures.
let
  expected = @["1", "2", "3", "4", "5"]
  res = findAndCaptureAll("a1b2c3d4e5", re"\d")
doAssert(res == expected)
proc split(s: string; sep: Regex): seq[string] {...}{.raises: [], tags: [].}
return not matched substrings
doAssert(split("11a22Ϊ33Ⓐ44弢55", re"\d+") ==
  @["", "a", "Ϊ", "Ⓐ", "弢", ""])
proc splitIncl(s: string; sep: Regex): seq[string] {...}{.raises: [], tags: [].}
return not matched substrings, including captured groups
doAssert splitIncl("a,b", re"(,)") ==
  @["a", ",", "b"]
proc startsWith(s: string; pattern: Regex; start = 0): bool {...}{.raises: [], tags: [].}
return whether the string starts with the pattern or not
doAssert("abc".startsWith(re"\w"))
doAssert(not "abc".startsWith(re"\d"))
proc endsWith(s: string; pattern: Regex): bool {...}{.raises: [], tags: [].}
return whether the string ends with the pattern or not
doAssert("abc".endsWith(re"\w"))
doAssert(not "abc".endsWith(re"\d"))
proc replace(s: string; pattern: Regex; by: string; limit = 0): string {...}{.
    raises: [ValueError], tags: [].}

Replace matched substrings.

Matched groups can be accessed with $N notation, where N is the group's index, starting at 1 (1-indexed). $$ means literal $.

If limit is given, at most limit replacements are done. limit of 0 means there is no limit

doAssert("aaa".replace(re"a", "b", 1) == "baa")
doAssert("abc".replace(re"(a(b)c)", "m($1) m($2)") ==
  "m(abc) m(b)")
doAssert("Nim is awesome!".replace(re"(\w\B)", "$1_") ==
  "N_i_m i_s a_w_e_s_o_m_e!")
proc replace(s: string; pattern: Regex; by: proc (m: RegexMatch; s: string): string;
            limit = 0): string {...}{.raises: [], tags: [].}

Replace matched substrings.

If limit is given, at most limit replacements are done. limit of 0 means there is no limit

proc removeEvenWords(m: RegexMatch, s: string): string =
  result = ""
  if m.group(1).len mod 2 != 0:
    result = s[m.group(0)[0]]

let
  text = "Es macht Spaß, alle geraden Wörter zu entfernen!"
  expected = "macht , geraden entfernen!"
doAssert(text.replace(re"((\w)+\s*)", removeEvenWords) == expected)
proc toPattern(s: string): Regex {...}{.raises: [RegexError], tags: [].}
Parse and compile a regular expression.
# compiled at run-time
let patternA = toPattern(r"aa?")
# compiled at compile-time
const patternB = toPattern(r"aa?")

Iterators

iterator group(m: RegexMatch; i: int): Slice[int] {...}{.raises: [], tags: [].}
return slices for a given group. Slices of start > end are empty matches (i.e.: re"(\d?)") and they are included same as in PCRE.
let
  expected = ["a", "b", "c"]
  text = "abc"
var m: RegexMatch
doAssert text.match(re"(\w)+", m)
var i = 0
for bounds in m.group(0):
  doAssert(expected[i] == text[bounds])
  inc i
iterator group(m: RegexMatch; s: string): Slice[int] {...}{.raises: [KeyError], tags: [].}
return slices for a given named group
let
  expected = ["a", "b", "c"]
  text = "abc"
var m: RegexMatch
doAssert text.match(re"(?P<foo>\w)+", m)
var i = 0
for bounds in m.group("foo"):
  doAssert(expected[i] == text[bounds])
  inc i
iterator findAll(s: string; pattern: Regex; start = 0): RegexMatch {...}{.inline, raises: [],
    tags: [].}
search through the string and return each match. Empty matches (start > end) are included
var i = 0
let
  expected = [1 .. 2, 4 .. 5]
  text = "abcabc"
for m in findAll(text, re"bc"):
  doAssert text[m.boundaries] == "bc"
  doAssert m.boundaries == expected[i]
  inc i
iterator split(s: string; sep: Regex): string {...}{.inline, raises: [], tags: [].}
return not matched substrings
var i = 0
let expected = ["", "a", "Ϊ", "Ⓐ", "弢", ""]
for s in split("11a22Ϊ33Ⓐ44弢55", re"\d+"):
  doAssert(s == expected[i])
  inc i

Templates

template re(s: string): Regex
Parse and compile a regular expression at compile-time