# Source: src/silo:std.str
##! Stdlib `Str` methods — prefix/suffix, search, case, padding, repetition, joining, UTF conversion, trimming, splitting.
##!
##! Core operations (`.len`, `.bytes`, `.slice`, `+`, `split`,
##! `contains`, `trim`, `format`) live on the `Str` lang-item and its
##! host intrinsics; this module adds the rest directly on `:impl Str`.

:use
  :open core AnyInt AnyUInt Bool Bytes Codepoint Option None Some Pair Range Result Ok Err Str Vec
  :open matcher CodepointMatcher
:end

:impl Str
  # si[impl str.starts-with]
  ## `.starts-with` — true iff the receiver starts with the prefix.
  ## Delegates to `str-starts-with`.
  .starts-with ( Str Str -> Bool )
    str-starts-with ;

  # si[impl str.ends-with]
  ## `.ends-with` — true iff the receiver ends with the suffix.
  ## Delegates to `str-ends-with`.
  .ends-with ( Str Str -> Bool )
    str-ends-with ;

  # si[impl str.strip-prefix]
  ## `.strip-prefix` — remove the prefix if present; `Some rest` or `None`.
  .strip-prefix ( Str Str -> (Option Str) )
    str-strip-prefix ;

  # si[impl str.strip-suffix]
  ## `.strip-suffix` — remove the suffix if present; `Some rest` or `None`.
  .strip-suffix ( Str Str -> (Option Str) )
    str-strip-suffix ;

  # NOTE(review): the `RangeBoth` / `RangeFrom` / `RangeTo` / `RangeFull`
  # variants matched below are not listed in the `:open core` line (only
  # `Range` is) — presumably match patterns resolve against the scrutinee
  # type; confirm.
  # si[impl str.find+1]
  ## `.find` — byte offset of the first occurrence of `m` in the receiver,
  ## or `None` if `m` does not match. Accepts any `(CodepointMatcher m)`:
  ## the built-in `Str`, `Codepoint`, `(Vec Codepoint)`, or a predicate
  ## quotation. Delegates to `.find-in` (after a `swap`, so the matcher and
  ## the receiver trade places on the stack) and projects the start offset
  ## of the returned range. Ranges that carry no explicit start (`RangeTo`,
  ## `RangeFull`) project to offset `0`.
  .find ( Str m -> (Option AnyInt) ) { (CodepointMatcher m) }
    swap .find-in
    :match
      | None => None
      | Some r => r :match
        | RangeBoth s _ => s Some
        | RangeFrom s => s Some
        | RangeTo _ => 0 Some
        | RangeFull => 0 Some
      :end
    :end ;

  # si[impl str.rfind]
  ## `.rfind` — byte offset of the last occurrence, or -1 if not found.
  ## Still takes a literal `Str` pending a backward-scanning Matcher variant.
  ## Note the asymmetry with `.find`: absence is reported with the `-1`
  ## sentinel in a plain `AnyInt`, not with `None`.
  .rfind ( Str Str -> AnyInt )
    str-rfind ;

  # NOTE(review): every other annotation in this file uses the `str.`
  # prefix; confirm `string.contains` is the intended spec id.
  # si[impl string.contains+1]
  ## `.contains` — `true` iff `m` matches anywhere in the receiver. Accepts
  ## any `(CodepointMatcher m)`; delegates to `.find` and projects presence
  ## via `.is-some`.
  .contains ( Str m -> Bool ) { (CodepointMatcher m) }
    .find .is-some ;

  # si[impl str.replace]
  ## `.replace` — replace first occurrence of target with replacement.
  .replace ( Str Str Str -> Str )
    str-replace ;

  # si[impl str.replace-all]
  ## `.replace-all` — replace every non-overlapping occurrence.
  .replace-all ( Str Str Str -> Str )
    str-replace-all ;

  # si[impl str.to-upper]
  ## `.to-upper` — Unicode uppercase (via Rust `std`).
  .to-upper ( Str -> Str )
    str-to-upper ;

  # si[impl str.to-lower]
  ## `.to-lower` — Unicode lowercase (via Rust `std`).
  .to-lower ( Str -> Str )
    str-to-lower ;

  # si[impl str.to-ascii-upper]
  ## `.to-ascii-upper` — ASCII-only uppercase; non-ASCII bytes pass through.
  .to-ascii-upper ( Str -> Str )
    str-to-ascii-upper ;

  # si[impl str.to-ascii-lower]
  ## `.to-ascii-lower` — ASCII-only lowercase; non-ASCII bytes pass through.
  .to-ascii-lower ( Str -> Str )
    str-to-ascii-lower ;

  # si[impl str.is-ascii]
  ## `.is-ascii` — true iff every byte is < 128.
  .is-ascii ( Str -> Bool )
    str-is-ascii ;

  # si[impl str.is-empty]
  ## `.is-empty` — true iff the byte length is zero.
  .is-empty ( Str -> Bool )
    str-is-empty ;

  # si[impl str.repeat]
  ## `.repeat` — concatenate the receiver N times (0 or negative yields "").
  .repeat ( Str AnyInt -> Str )
    str-repeat ;

  # si[impl str.char-at]
  ## `.char-at` — nth codepoint (0-indexed) as `(Option Codepoint)`.
  .char-at ( Str AnyInt -> (Option Codepoint) )
    str-char-at-opt ;

  # si[impl str.byte-at]
  ## `.byte-at` — byte at index as `(Option AnyUInt)` (the byte value, in
  ## the half-open range `0..256`).
  .byte-at ( Str AnyInt -> (Option AnyUInt) )
    str-byte-at-opt ;

  # si[impl str.lines]
  ## `.lines` — split by newline (`\n`, `\r\n`, `\r`); trailing newline does
  ## not emit an extra empty element.
  .lines ( Str -> (Vec Str) )
    str-lines ;

  # si[impl str.words]
  ## `.words` — split on Unicode whitespace, dropping empties.
  .words ( Str -> (Vec Str) )
    str-words ;

  # si[impl str.split]
  ## `.split` — split by a `Str` separator into a `(Vec Str)`. Mirrors the
  ## core `split` word with `.method` dispatch.
  .split ( Str Str -> (Vec Str) )
    str-split ;

  # si[impl str.split-once]
  ## `.split-once` — split at the first occurrence of a `Str` separator,
  ## returning `Some (Pair left right)` (separator excluded) or `None` when
  ## the separator is absent. This is the literal-`Str` form; the full spec
  ## signature uses a `Matcher` constraint, which is deferred until Matcher
  ## trait rewiring lands.
  .split-once ( Str Str -> (Option (Pair Str Str)) )
    str-split-once ;

  # si[impl str.split-whitespace]
  ## `.split-whitespace` — split on runs of Unicode whitespace, dropping
  ## empties. Equivalent in content to `.words`; returned eagerly as a
  ## `(Vec Str)` pending a lazy iterator surface for `Str` sources.
  .split-whitespace ( Str -> (Vec Str) )
    str-split-whitespace ;

  # si[impl str.trim]
  ## `.trim` — trim Unicode `White_Space` from both ends.
  .trim ( Str -> Str )
    str-trim-unicode ;

  # si[impl str.trim-start]
  ## `.trim-start` — trim Unicode `White_Space` from the start only.
  .trim-start ( Str -> Str )
    str-trim-start ;

  # si[impl str.trim-end]
  ## `.trim-end` — trim Unicode `White_Space` from the end only.
  .trim-end ( Str -> Str )
    str-trim-end ;

  # si[impl str.pad-left]
  ## `.pad-left` — left-pad with `fill` until `.codepoints .count >= width`.
  ## Operands after the receiver: `width` (`AnyInt`), then `fill`
  ## (`Codepoint`).
  .pad-left ( Str AnyInt Codepoint -> Str )
    str-pad-left ;

  # si[impl str.pad-right]
  ## `.pad-right` — right-pad with `fill` until `.codepoints .count >= width`.
  ## Operands after the receiver: `width` (`AnyInt`), then `fill`
  ## (`Codepoint`).
  .pad-right ( Str AnyInt Codepoint -> Str )
    str-pad-right ;

  # si[impl str.join]
  ## `.join` — insert separator between vector elements and concatenate.
  ## The `Str` operand is the separator; the `(Vec Str)` supplies the
  ## pieces to be joined.
  .join ( Str (Vec Str) -> Str )
    str-join ;

  # si[impl str.to-utf16]
  ## `.to-utf16` — encode as UTF-16 code units. Returns `(Vec AnyUInt)` —
  ## each element is a UTF-16 code unit in the range `0..65536`. The
  ## concrete `U16` alias is not used in the signature because alias
  ## normalisation during signature registration conflicts with the
  ## intrinsic's existential bound on the element type.
  .to-utf16 ( Str -> (Vec AnyUInt) )
    str-to-utf16 ;

  # si[impl str.chars]
  ## `.chars` — enumerate the Unicode codepoints of the receiver into a
  ## `(Vec Codepoint)`. Eager Vec complement of the spec's lazy
  ## `.codepoints` iterator.
  .chars ( Str -> (Vec Codepoint) )
    str-chars ;

  # si[impl str.bytes]
  ## `.bytes` — return the UTF-8 encoding as a `Bytes` cell. Eager Bytes
  ## form; the spec's lazy `(Iterator U8)` form lives on the core `Str`
  ## type.
  .bytes ( Str -> Bytes )
    str-bytes ;

  # si[impl str.as-bytes]
  ## `.as-bytes` — zero-copy-style view of the receiver's UTF-8 encoding.
  ## Runtime shape matches `.bytes`: a plain `Bytes` cell (both words
  ## delegate to the same `str-bytes` word). The "zero-copy" guarantee is a
  ## forward-compatibility contract; today the implementation allocates a
  ## fresh `Bytes` over the byte view.
  .as-bytes ( Str -> Bytes )
    str-bytes ;

  # si[impl str.char-count]
  ## `.char-count` — count of Unicode codepoints (distinct from `.len`,
  ## which is the UTF-8 byte count).
  .char-count ( Str -> AnyInt )
    str-char-count ;
:end

# si[impl str.from-utf16]
## `str-from-utf16` — decode a `(Vec AnyUInt)` of UTF-16 code units to a
## `Str`; invalid surrogates produce `Err`. Free word because the receiver
## is a Vec (no Str in the input), matching the `bytes-from-hex` pattern.
:fn(pub) str-from-utf16 ( (Vec AnyUInt) -> (Result Str Str) )
  str-from-utf16-intrinsic
:end

# si[impl str.from-utf8-lossy]
## `str-from-utf8-lossy` — decode bytes as UTF-8, replacing invalid sequences
## with U+FFFD. The input is `Bytes` = `(Vec Byte)`; the signature uses
## `(Vec AnyUInt)` because the alias-resolver does not expand `Bytes`
## during signature registration. Free word for the same reason as
## `str-from-utf16`.
:fn(pub) str-from-utf8-lossy ( (Vec AnyUInt) -> Str )
  str-from-utf8-lossy-intrinsic
:end