silo:std.str

Source src/silo:std.str

1##! Stdlib `Str` methods — prefix/suffix, search, case, padding, repetition, joining, UTF conversion, trimming, splitting.
2##!
3##! Core operations (`.len`, `.bytes`, `.slice`, `+`, `split`, `contains`,
4##! `trim`, `format`) live on the `Str` lang-item and its host intrinsics; this
5##! module adds the rest, and `.method` forms of some of those, on `:impl Str`.
6
7:use
8  :open core AnyInt AnyUInt Bool Bytes Codepoint Option None Some Pair Range Result Ok Err Str Vec
9  :open matcher CodepointMatcher
10:end
11
12:impl Str
13  # si[impl str.starts-with]
14  ## `.starts-with` — true iff the receiver starts with the prefix.
15  .starts-with ( Str Str -> Bool )
16    str-starts-with ;
17
18  # si[impl str.ends-with]
19  ## `.ends-with` — true iff the receiver ends with the suffix.
20  .ends-with ( Str Str -> Bool )
21    str-ends-with ;
22
23  # si[impl str.strip-prefix]
24  ## `.strip-prefix` — remove the prefix if present; `Some rest` or `None`.
25  .strip-prefix ( Str Str -> (Option Str) )
26    str-strip-prefix ;
27
28  # si[impl str.strip-suffix]
29  ## `.strip-suffix` — remove the suffix if present; `Some rest` or `None`.
30  .strip-suffix ( Str Str -> (Option Str) )
31    str-strip-suffix ;
32
33  # si[impl str.find+1]
34  ## `.find` — byte offset of the first occurrence of `m` in the receiver, or
35  ## `None` if `m` does not match. Accepts any `(CodepointMatcher m)`: the
36  ## built-in `Str`, `Codepoint`, `(Vec Codepoint)`, or a predicate quotation.
37  ## Delegates to `.find-in` and projects the range start (0 if unbounded below).
38  .find ( Str m -> (Option AnyInt) ) { (CodepointMatcher m) }
39    swap .find-in
40    :match
41      | None   => None
42      | Some r => r :match
43        | RangeBoth s _ => s Some
44        | RangeFrom s   => s Some
45        | RangeTo _     => 0 Some
46        | RangeFull     => 0 Some
47      :end
48    :end ;
49
50  # si[impl str.rfind]
51  ## `.rfind` — byte offset of the last occurrence, or -1 (not `None`) if absent.
52  ## Still takes a literal `Str` pending a backward-scanning Matcher variant.
53  .rfind ( Str Str -> AnyInt )
54    str-rfind ;
55
56  # si[impl string.contains+1]
57  ## `.contains` — `true` iff `m` matches anywhere in the receiver. Accepts
58  ## any `(CodepointMatcher m)`; delegates to `.find` and projects presence.
59  .contains ( Str m -> Bool ) { (CodepointMatcher m) }
60    .find .is-some ;
61
62  # si[impl str.replace]
63  ## `.replace` — replace first occurrence of target with replacement.
64  .replace ( Str Str Str -> Str )
65    str-replace ;
66
67  # si[impl str.replace-all]
68  ## `.replace-all` — replace every non-overlapping occurrence.
69  .replace-all ( Str Str Str -> Str )
70    str-replace-all ;
71
72  # si[impl str.to-upper]
73  ## `.to-upper` — Unicode uppercase (via Rust `std`).
74  .to-upper ( Str -> Str )
75    str-to-upper ;
76
77  # si[impl str.to-lower]
78  ## `.to-lower` — Unicode lowercase (via Rust `std`).
79  .to-lower ( Str -> Str )
80    str-to-lower ;
81
82  # si[impl str.to-ascii-upper]
83  ## `.to-ascii-upper` — ASCII-only uppercase; non-ASCII bytes pass through.
84  .to-ascii-upper ( Str -> Str )
85    str-to-ascii-upper ;
86
87  # si[impl str.to-ascii-lower]
88  ## `.to-ascii-lower` — ASCII-only lowercase; non-ASCII bytes pass through.
89  .to-ascii-lower ( Str -> Str )
90    str-to-ascii-lower ;
91
92  # si[impl str.is-ascii]
93  ## `.is-ascii` — true iff every byte is < 128.
94  .is-ascii ( Str -> Bool )
95    str-is-ascii ;
96
97  # si[impl str.is-empty]
98  ## `.is-empty` — true iff the byte length is zero.
99  .is-empty ( Str -> Bool )
100    str-is-empty ;
101
102  # si[impl str.repeat]
103  ## `.repeat` — concatenate the receiver N times (0 or negative yields "").
104  .repeat ( Str AnyInt -> Str )
105    str-repeat ;
106
107  # si[impl str.char-at]
108  ## `.char-at` — nth codepoint (0-indexed) as `(Option Codepoint)`.
109  .char-at ( Str AnyInt -> (Option Codepoint) )
110    str-char-at-opt ;
111
112  # si[impl str.byte-at]
113  ## `.byte-at` — byte at index as `(Option AnyUInt)` (byte value in `0..256`).
114  .byte-at ( Str AnyInt -> (Option AnyUInt) )
115    str-byte-at-opt ;
116
117  # si[impl str.lines]
118  ## `.lines` — split by newline (`\n`, `\r\n`, `\r`); trailing newline does
119  ## not emit an extra empty element.
120  .lines ( Str -> (Vec Str) )
121    str-lines ;
122
123  # si[impl str.words]
124  ## `.words` — split on Unicode whitespace, dropping empties.
125  .words ( Str -> (Vec Str) )
126    str-words ;
127
128  # si[impl str.split]
129  ## `.split` — split by a `Str` separator into a `(Vec Str)`. Mirrors the
130  ## core `split` word with `.method` dispatch.
131  .split ( Str Str -> (Vec Str) )
132    str-split ;
133
134  # si[impl str.split-once]
135  ## `.split-once` — split at the first occurrence of a `Str` separator,
136  ## returning `Some (Pair left right)` (separator excluded) or `None` when
137  ## the separator is absent. This is the literal-`Str` form; the full spec
138  ## signature uses a `Matcher` constraint, which is deferred until Matcher
139  ## trait rewiring lands.
140  .split-once ( Str Str -> (Option (Pair Str Str)) )
141    str-split-once ;
142
143  # si[impl str.split-whitespace]
144  ## `.split-whitespace` — split on runs of Unicode whitespace, dropping
145  ## empties. Equivalent in content to `.words`; returned eagerly as a
146  ## `(Vec Str)` pending a lazy iterator surface for `Str` sources.
147  .split-whitespace ( Str -> (Vec Str) )
148    str-split-whitespace ;
149
150  # si[impl str.trim]
151  ## `.trim` — trim Unicode `White_Space` from both ends.
152  .trim ( Str -> Str )
153    str-trim-unicode ;
154
155  # si[impl str.trim-start]
156  ## `.trim-start` — trim Unicode `White_Space` from the start only.
157  .trim-start ( Str -> Str )
158    str-trim-start ;
159
160  # si[impl str.trim-end]
161  ## `.trim-end` — trim Unicode `White_Space` from the end only.
162  .trim-end ( Str -> Str )
163    str-trim-end ;
164
165  # si[impl str.pad-left]
166  ## `.pad-left` — left-pad with `fill` until `.codepoints .count >= width`.
167  .pad-left ( Str AnyInt Codepoint -> Str )
168    str-pad-left ;
169
170  # si[impl str.pad-right]
171  ## `.pad-right` — right-pad with `fill` until `.codepoints .count >= width`.
172  .pad-right ( Str AnyInt Codepoint -> Str )
173    str-pad-right ;
174
175  # si[impl str.join]
176  ## `.join` — insert separator between vector elements and concatenate.
177  .join ( Str (Vec Str) -> Str )
178    str-join ;
179
180  # si[impl str.to-utf16]
181  ## `.to-utf16` — encode as UTF-16 code units. Returns `(Vec AnyUInt)` —
182  ## each element is a UTF-16 code unit in the range `0..65536`. The
183  ## concrete `U16` alias is not used in the signature because alias
184  ## normalisation during signature registration conflicts with the
185  ## intrinsic's existential bound on the element type.
186  .to-utf16 ( Str -> (Vec AnyUInt) )
187    str-to-utf16 ;
188
189  # si[impl str.chars]
190  ## `.chars` — enumerate the Unicode codepoints of the receiver into a
191  ## `(Vec Codepoint)`. Eager Vec complement of the spec's lazy
192  ## `.codepoints` iterator.
193  .chars ( Str -> (Vec Codepoint) )
194    str-chars ;
195
196  # si[impl str.bytes]
197  ## `.bytes` — return the UTF-8 encoding as a `Bytes` cell. Eager Bytes
198  ## form; the spec's lazy `(Iterator U8)` form lives on the core `Str`
199  ## type.
200  .bytes ( Str -> Bytes )
201    str-bytes ;
202
203  # si[impl str.as-bytes]
204  ## `.as-bytes` — zero-copy-style view of the receiver's UTF-8 encoding.
205  ## Runtime shape matches `.bytes`: a plain `Bytes` cell. The "zero-copy"
206  ## guarantee is a forward-compatibility contract; today this is the same
207  ## `str-bytes` intrinsic as `.bytes`: a fresh `Bytes` over the byte view.
208  .as-bytes ( Str -> Bytes )
209    str-bytes ;
210
211  # si[impl str.char-count]
212  ## `.char-count` — count of Unicode codepoints (distinct from `.len`,
213  ## which is the UTF-8 byte count).
214  .char-count ( Str -> AnyInt )
215    str-char-count ;
216:end
217
218# si[impl str.from-utf16]
219## `str-from-utf16` — decode a `(Vec AnyUInt)` of UTF-16 code units to a
220## `Str`; invalid surrogates produce an `Err` carrying a `Str`. A free word: the
221## input is a Vec (no Str receiver), matching the `bytes-from-hex` pattern.
222:fn(pub) str-from-utf16 ( (Vec AnyUInt) -> (Result Str Str) )
223  str-from-utf16-intrinsic
224:end
225
226# si[impl str.from-utf8-lossy]
227## `str-from-utf8-lossy` — decode bytes as UTF-8, replacing invalid sequences
228## with U+FFFD; returns a plain `Str`, not a `Result`. The input is `Bytes` =
229## `(Vec Byte)`, but the signature uses `(Vec AnyUInt)` because the alias-resolver
230## does not expand `Bytes` during signature registration. Free word for the
231## same reason as `str-from-utf16`: the input has no `Str` receiver.
232:fn(pub) str-from-utf8-lossy ( (Vec AnyUInt) -> Str )
233  str-from-utf8-lossy-intrinsic
234:end