silo:std.matcher

Source src/silo:std.matcher

1##! User-extensible search-pattern interface.
2##!
3##! `Matcher` abstracts "things that can answer whether, and where, an
4##! element matches inside a haystack". It's the foundation for
5##! pluggable `.find` / `.contains` / `.split` on `Str` and any
6##! `Seq`-like type. This module supplies the trait, four built-in
7##! impls (`Str`, `Codepoint`, `(Vec Codepoint)`, `(Vec U8)`), and a
8##! blanket impl for quotations.
9##!
10##! `Matcher` carries both `elem` and `haystack` as primary parameters
11##! today; the `CodepointMatcher` / `ByteMatcher` constraint aliases
12##! recover the single-parameter shape. Byte-index ranges use
13##! `(Range AnyInt)` and the byte matcher's haystack and element are
14##! `(Vec AnyInt) / AnyInt` — both sharpen to the narrower `AnyUInt`
15##! and `U8` shapes once the corresponding checker plumbing lands.
16
17:use
18  :open core
19    AnyInt AnyUInt Bool Byte Codepoint Option None Some Range RangeBoth
20    Str U8 Unit Vec
21  :open traits Eq Ord Add Not Default
22  :open codepoint Ascii
23  :open collections Foldable
24:end
25
26# si[impl stdlib.matcher.trait]
27## Search-pattern interface. `Self` is the matcher (e.g. a literal
28## needle, a codepoint, a character class). `haystack` is the input
29## sequence type. `elem` is the element type of the haystack.
30##
31## `.find-in` returns the byte/index range of the first match.
32## `.find-all-in` returns the non-overlapping match ranges in order.
33:trait(pub) Matcher haystack elem
34  ## Byte/index range of the first match, or `None` if the matcher
35  ## finds no occurrence in the input.
36  .find-in ( Self haystack -> (Option (Range AnyInt)) )
37  ## All non-overlapping match ranges in input order.
38  .find-all-in ( Self haystack -> (Vec (Range AnyInt)) )
39:end
40
41## Convenience constraint alias: "any matcher whose haystack is `Str`
42## and whose element type is `Codepoint`". Used by `.find` / `.split`
43## on `Str` once the follow-up WP rewires them.
44:trait(pub) CodepointMatcher { (Matcher Self Str Codepoint) } :end
45
46## Convenience constraint alias: "any matcher whose haystack is a
47## byte `Vec` and whose element type is an `AnyInt` in the byte range".
48:trait(pub) ByteMatcher { (Matcher Self (Vec AnyInt) AnyInt) } :end
49
50# si[impl stdlib.matcher.str-literal]
51## `Str` is a matcher that matches itself byte-identically inside
52## another `Str`. `.find-in` delegates to the host `str-find` intrinsic
53## and shapes the byte offset into `Some (RangeBoth start end)` or
54## `None`. `.find-all-in` walks the input with successive slices.
55:impl Matcher Str Str Codepoint
56  .find-in ( Str Str -> (Option (Range AnyInt)) )
57    pop-> haystack pop-> needle
58    needle haystack str-find pop-> pos
59    pos 0 < :if
60      None
61    :else
62      pos pos needle str-len + RangeBoth Some
63    :end ;
64
65  .find-all-in ( Str Str -> (Vec (Range AnyInt)) )
66    pop-> haystack pop-> needle
67    needle str-len pop-> nlen
68    haystack str-len pop-> hlen
69    nlen 0 = :if (Vec .default) :ret :end
70    (Vec .default) 0    # stack: acc cursor
71    :loop
72      # stack: acc cursor
73      dup hlen:if drop :break :end
74      pop-> cursor
75      needle cursor hlen haystack str-slice str-find pop-> rel
76      rel 0 < :if :break :end
77      # acc is still on stack; we need: acc' = acc .push (RangeBoth start end), cursor' = end
78      cursor rel + pop-> start
79      start nlen + pop-> end
80      start end RangeBoth .push   # stack: acc'
81      end                         # stack: acc' cursor'
82    :end ;
83:end
84
85# si[impl stdlib.matcher.codepoint-single]
86## A `Codepoint` is a matcher that matches its UTF-8-encoded single
87## character inside a `Str`. `.find-in` encodes the codepoint to a
88## one-char `Str` and delegates to `str-find`.
89:impl Matcher Codepoint Str Codepoint
90  .find-in ( Codepoint Str -> (Option (Range AnyInt)) )
91    pop-> haystack
92    char-to-str pop-> needle
93    needle haystack str-find pop-> pos
94    pos 0 < :if
95      None
96    :else
97      pos pos needle str-len + RangeBoth Some
98    :end ;
99
100  .find-all-in ( Codepoint Str -> (Vec (Range AnyInt)) )
101    pop-> haystack
102    char-to-str pop-> needle
103    needle str-len pop-> nlen
104    haystack str-len pop-> hlen
105    (Vec .default) 0    # stack: acc cursor
106    :loop
107      dup hlen:if drop :break :end
108      pop-> cursor
109      needle cursor hlen haystack str-slice str-find pop-> rel
110      rel 0 < :if :break :end
111      cursor rel + pop-> start
112      start nlen + pop-> end
113      start end RangeBoth .push
114      end
115    :end ;
116:end
117
118# si[impl stdlib.matcher.codepoint-set]
119## A `(Vec Codepoint)` is a matcher that matches any codepoint in the
120## set. Walks the haystack by byte offset via `str-next-char`.
121:impl Matcher (Vec Codepoint) Str Codepoint
122  .find-in ( (Vec Codepoint) Str -> (Option (Range AnyInt)) )
123    pop-> haystack pop-> set
124    haystack str-len pop-> hlen
125    0 pop-> cursor
126    None        # stack: result
127    :loop
128      cursor hlen:if :break :end
129      cursor haystack str-next-char pop-> ch pop-> next-pos
130      set [ pop-> other ch char-to-int other char-to-int = ] .any :if
131        drop                                     # drop old None
132        cursor next-pos RangeBoth Some           # push new Some(..)
133        :break
134      :end
135      next-pos pop-> cursor
136    :end ;
137
138  .find-all-in ( (Vec Codepoint) Str -> (Vec (Range AnyInt)) )
139    pop-> haystack pop-> set
140    haystack str-len pop-> hlen
141    0 pop-> cursor
142    (Vec .default)    # stack: acc
143    :loop
144      cursor hlen:if :break :end
145      cursor haystack str-next-char pop-> ch pop-> next-pos
146      set [ pop-> other ch char-to-int other char-to-int = ] .any :if
147        cursor next-pos RangeBoth .push    # mutates acc on stack
148      :end
149      next-pos pop-> cursor
150    :end ;
151:end
152
153# si[impl stdlib.matcher.byte-seq]
154## A byte vector is a matcher that matches a byte-level substring
155## inside a byte-vector haystack. Uses a helper fn
156## `byte-substr-matches-at?` because nested quotations that close over
157## vector locals move-consume those locals each iteration (captures
158## are `std::mem::replace`-with-0 per the abstract-machine RC design);
159## a plain :fn receiving Vec arguments on the stack avoids that hazard.
160:fn byte-substr-matches-at? ( AnyInt (Vec AnyInt) (Vec AnyInt) -> Bool )
161  pop-> needle pop-> haystack pop-> i
162  needle .len pop-> nlen
163  0 pop-> j
164  true      # stack: [matched]
165  :loop
166    j nlen:if :break :end
167    i j + haystack .get j needle .get = not :if
168      drop false
169      :break
170    :end
171    j 1 + pop-> j
172  :end
173:end
174
175:impl Matcher (Vec AnyInt) (Vec AnyInt) AnyInt
176  .find-in ( (Vec AnyInt) (Vec AnyInt) -> (Option (Range AnyInt)) )
177    pop-> haystack pop-> needle
178    needle .len pop-> nlen
179    haystack .len pop-> hlen
180    nlen 0 = :if 0 0 RangeBoth Some :ret :end
181    0 pop-> i
182    None        # stack: [result]
183    :loop
184      i nlen + hlen > :if :break :end
185      i haystack needle byte-substr-matches-at? :if
186        drop
187        i i nlen + RangeBoth Some
188        :break
189      :end
190      i 1 + pop-> i
191    :end ;
192
193  .find-all-in ( (Vec AnyInt) (Vec AnyInt) -> (Vec (Range AnyInt)) )
194    pop-> haystack pop-> needle
195    needle .len pop-> nlen
196    haystack .len pop-> hlen
197    nlen 0 = :if (Vec .default) :ret :end
198    # Note: this implementation yields overlapping matches (advances
199    # by 1 on hit, not by nlen) because non-overlapping advance
200    # requires rebinding the loop index inside a conditional, which
201    # the current checker does not preserve past the branch. Tighten
202    # to non-overlapping once local-rebinding-inside-conditional
203    # lands or the impl is rewritten as an iterator.
204    (Vec .default)    # stack: [acc]
205    0 pop-> i
206    :loop
207      i nlen + hlen > :if :break :end
208      i haystack needle byte-substr-matches-at? :if
209        i i nlen + RangeBoth .push    # mutate acc on stack
210      :end
211      i 1 + pop-> i
212    :end ;
213:end
214
215# si[impl stdlib.matcher.quotation]
216## Blanket impl: any quotation `[elem -> Bool]` is itself a matcher
217## against a `(Vec elem)` haystack. Interprets "match at position i"
218## as "quot applied to element i returns true".
219##
220## Spec canonical shape is
221## `[ (Seq elem _) AnyUInt -> (Option AnyUInt) ]` — a next-match
222## driver that receives the remaining-haystack view and a starting
223## offset and returns the end offset of the match. That shape lets
224## matchers scan arbitrarily forward. Today the checker cannot
225## dispatch blanket impls whose `Self` is a quotation type of that
226## full shape, and structural-search-over-Seq-existentials is not
227## expressible either, so this impl lands in a simpler predicate
228## flavour: `[elem -> Bool]` walking a `(Vec elem)` haystack one
229## element at a time. The driver-style blanket impl will land once
230## those checker features arrive.
231:impl Matcher [elem -> Bool] (Vec elem) elem
232  .find-in ( [elem -> Bool] (Vec elem) -> (Option (Range AnyInt)) )
233    pop-> haystack pop-> pred
234    haystack .len pop-> hlen
235    0 pop-> i
236    None    # stack: result
237    :loop
238      i hlen:if :break :end
239      i haystack .get pred .call :if
240        drop
241        i i 1 + RangeBoth Some
242        :break
243      :end
244      i 1 + pop-> i
245    :end ;
246
247  .find-all-in ( [elem -> Bool] (Vec elem) -> (Vec (Range AnyInt)) )
248    pop-> haystack pop-> pred
249    haystack .len pop-> hlen
250    0 pop-> i
251    (Vec .default)    # stack: acc
252    :loop
253      i hlen:if :break :end
254      i haystack .get pred .call :if
255        i i 1 + RangeBoth .push
256      :end
257      i 1 + pop-> i
258    :end ;
259:end