artichoke_backend/extn/core/string/
mod.rs

1use core::ops::Deref;
2use std::ffi::{c_char, c_void};
3use std::ptr::NonNull;
4
5use artichoke_core::value::Value as _;
6use spinoso_exception::TypeError;
7#[doc(inline)]
8pub use spinoso_string::{Encoding, RawParts, String};
9
10use crate::Artichoke;
11use crate::convert::{BoxUnboxVmValue, UnboxedValueGuard};
12use crate::error::Error;
13use crate::sys;
14use crate::types::Ruby;
15use crate::value::Value;
16
17mod ffi;
18pub(in crate::extn) mod mruby;
19pub(super) mod trampoline;
20
/// Bit offset of the encoding field inside the flags of an `RString`.
///
/// The code in this module reads and writes the encoding as a 4-bit-wide
/// value (`0b1111`) shifted left by this many bits.
const ENCODING_FLAG_BITPOS: usize = 5;
22
23impl BoxUnboxVmValue for String {
24    type Unboxed = Self;
25    type Guarded = String;
26
27    const RUBY_TYPE: &'static str = "String";
28
29    #[expect(
30        clippy::cast_possible_truncation,
31        clippy::cast_sign_loss,
32        reason = "mruby stores sizes as int64_t instead of size_t"
33    )]
34    unsafe fn unbox_from_value<'a>(
35        value: &'a mut Value,
36        interp: &mut Artichoke,
37    ) -> Result<UnboxedValueGuard<'a, Self::Guarded>, Error> {
38        let _ = interp;
39
40        // Make sure we have a String otherwise extraction will fail.
41        // This check is critical to the safety of accessing the `value` union.
42        if value.ruby_type() != Ruby::String {
43            let mut message = std::string::String::from("uninitialized ");
44            message.push_str(Self::RUBY_TYPE);
45            return Err(TypeError::from(message).into());
46        }
47
48        let value = value.inner();
49        // SAFETY: The above check on the data type ensures the `value` union
50        // holds an `RString*` in the `p` variant.
51        let string = sys::mrb_sys_basic_ptr(value).cast::<sys::RString>();
52
53        let Some(ptr) = NonNull::<c_char>::new((*string).as_.heap.ptr) else {
54            // An allocated but uninitialized string has a null pointer, so swap in an empty string.
55            return Ok(UnboxedValueGuard::new(String::new()));
56        };
57        let length = (*string).as_.heap.len as usize;
58        let capacity = (*string).as_.heap.aux.capa as usize;
59
60        // the encoding flag is 4 bits wide.
61        let flags = string.as_ref().unwrap().flags();
62        let encoding_flag = flags & (0b1111 << ENCODING_FLAG_BITPOS);
63        let encoding = (encoding_flag >> ENCODING_FLAG_BITPOS) as u8;
64        let encoding = Encoding::try_from_flag(encoding).map_err(|_| TypeError::with_message("Unknown encoding"))?;
65
66        let s = String::from_raw_parts_with_encoding(
67            RawParts {
68                ptr: ptr.cast::<u8>().as_mut(),
69                length,
70                capacity,
71            },
72            encoding,
73        );
74        Ok(UnboxedValueGuard::new(s))
75    }
76
77    #[expect(
78        clippy::cast_possible_wrap,
79        reason = "mruby stores sizes as int64_t instead of size_t"
80    )]
81    fn alloc_value(value: Self::Unboxed, interp: &mut Artichoke) -> Result<Value, Error> {
82        let encoding = value.encoding();
83        let RawParts { ptr, length, capacity } = String::into_raw_parts(value);
84        let value = unsafe {
85            interp.with_ffi_boundary(|mrb| {
86                sys::mrb_sys_alloc_rstring(
87                    mrb,
88                    ptr.cast::<c_char>(),
89                    length as sys::mrb_int,
90                    capacity as sys::mrb_int,
91                )
92            })?
93        };
94        let string = unsafe { sys::mrb_sys_basic_ptr(value).cast::<sys::RString>() };
95        unsafe {
96            let flags = string.as_ref().unwrap().flags();
97            let encoding_bits = encoding.to_flag();
98            let flags_with_zeroed_encoding = flags & !(0b1111 << ENCODING_FLAG_BITPOS);
99            let flags_with_encoding = flags_with_zeroed_encoding | (u32::from(encoding_bits) << ENCODING_FLAG_BITPOS);
100            string.as_mut().unwrap().set_flags(flags_with_encoding);
101        }
102        Ok(interp.protect(value.into()))
103    }
104
105    #[expect(
106        clippy::cast_possible_wrap,
107        reason = "mruby stores sizes as int64_t instead of size_t"
108    )]
109    fn box_into_value(value: Self::Unboxed, into: Value, interp: &mut Artichoke) -> Result<Value, Error> {
110        // Make sure we have an String otherwise boxing will produce undefined
111        // behavior.
112        //
113        // This check is critical to the memory safety of future runs of the
114        // garbage collector.
115        assert_eq!(
116            into.ruby_type(),
117            Ruby::String,
118            "Tried to box String into {:?} value",
119            into.ruby_type()
120        );
121
122        let encoding = value.encoding();
123        let RawParts { ptr, length, capacity } = String::into_raw_parts(value);
124        let string = unsafe {
125            sys::mrb_sys_repack_into_rstring(
126                ptr.cast::<c_char>(),
127                length as sys::mrb_int,
128                capacity as sys::mrb_int,
129                into.inner(),
130            )
131        };
132        unsafe {
133            let flags = string.as_ref().unwrap().flags();
134            let encoding_bits = encoding.to_flag();
135            let flags_with_zeroed_encoding = flags & !(0b1111 << ENCODING_FLAG_BITPOS);
136            let flags_with_encoding = flags_with_zeroed_encoding | (u32::from(encoding_bits) << ENCODING_FLAG_BITPOS);
137            string.as_mut().unwrap().set_flags(flags_with_encoding);
138        }
139
140        Ok(interp.protect(into))
141    }
142
143    fn free(data: *mut c_void) {
144        // this function is never called. `String` is freed directly in the VM
145        // by calling `mrb_gc_free_str` which is defined in
146        // `extn/core/string/ffi.rs`.
147        //
148        // `String` should not have a destructor registered in the class
149        // registry.
150        let _ = data;
151        unreachable!("<String as BoxUnboxVmValue>::free is never called");
152    }
153}
154
155impl Deref for UnboxedValueGuard<'_, String> {
156    type Target = String;
157
158    fn deref(&self) -> &Self::Target {
159        self.as_inner_ref()
160    }
161}
162
#[cfg(test)]
mod tests {
    use crate::test::prelude::*;

    const SUBJECT: &str = "String";
    #[cfg(feature = "core-regexp")]
    const FUNCTIONAL_TEST: &[u8] = include_bytes!("string_functional_test.rb");

    /// Evaluate `code` on `interp` and panic with a Ruby backtrace if the
    /// evaluation returns an error.
    fn eval_ok(interp: &mut crate::Artichoke, code: &[u8]) {
        let outcome = interp.eval(code);
        unwrap_or_panic_with_backtrace(interp, SUBJECT, outcome);
    }

    #[test]
    #[cfg(feature = "core-regexp")]
    fn functional() {
        let mut interp = interpreter();
        // Load the functional test source, then invoke its `spec` entry point.
        eval_ok(&mut interp, FUNCTIONAL_TEST);
        eval_ok(&mut interp, b"spec");
    }

    #[test]
    fn modifying_and_repacking_encoding_zeroes_old_encoding_flags() {
        // Push a UTF-8 string into an empty binary string so that the
        // receiver's encoding changes in place.
        //
        // A char length of 1 afterwards shows the string is now treated as
        // UTF-8 rather than binary.
        let script = "be = ''.b ; be << '😀' ; raise 'unexpected encoding' unless be.length == 1";
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }

    #[test]
    #[cfg(feature = "core-regexp")]
    fn start_with_regex() {
        // `start_with?` with a regexp argument must clear the relevant regexp
        // globals. The vendored MRI tests do not cover this, so it is
        // exercised here.
        let script = r"
            raise 'start_with? gives incorrect result' unless 'abcd test-123'.start_with?(/test-(\d+)/) == false;
            raise 'start_with? should clear Regexp.last_match' unless Regexp.last_match == nil
            raise 'start_with? should clear $1' unless $1 == nil
        ";
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }

    #[test]
    fn allocated_but_uninitialized_string_can_be_garbage_collected() {
        let script = r"
            1_000_000.times do
              String.allocate
            end
        ";
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
        // Run a full collection after creating the uninitialized strings.
        interp.full_gc().unwrap();
    }

    #[test]
    fn allocated_but_uninitialized_string_can_be_read() {
        // The ruby specs for `String.allocate` have more details:
        // `spec-runner/vendor/spec/core/string/allocate_spec.rb`
        //
        // ```console
        // [3.3.6] > s = String.allocate
        // => ""
        // [3.3.6] > s.empty?
        // => true
        // [3.3.6] > s.size == 0
        // => true
        // [3.3.6] > s.inspect.is_a? String
        // => true
        // [3.3.6] > s.inspect == '""'
        // => true
        // ```
        let script = r#"
            s = String.allocate
            raise 'String.allocate is not an instance of String' unless s.is_a?(String)
            raise 'String.allocate.inspect is not a String' unless s.inspect.is_a?(String)
            raise 'String.allocate is not empty' unless s.empty?
            raise 'String.allocate.size is not 0' unless s.size == 0
            raise 'String.allocate.inspect is not empty' unless s.inspect == '""'
        "#;
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }

    #[test]
    fn string_allocate_can_be_modified() {
        // A string obtained from `String.allocate` must accept appends and
        // report the combined size and contents afterwards.
        let script = r"
            s = String.allocate
            s << 'hello'
            s << 'world'
            raise 'String.allocate was not grown to correct size' unless s.size == 10
            raise 'String.allocate was not appendable' unless s == 'helloworld'
        ";
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }

    #[test]
    #[should_panic = "String.allocate.encoding is not binary"]
    fn freshly_allocated_string_has_binary_encoding() {
        // Reference behavior, which this test currently expects NOT to be
        // matched (see the `should_panic` attribute):
        //
        // ```console
        // $ irb
        // [3.3.6] > s = String.new
        // => ""
        // [3.3.6] > s.encoding == Encoding::BINARY
        // => true
        // [3.3.6] > s << "abc"
        // => "abc"
        // [3.3.6] > s.encoding == Encoding::UTF_8
        // => false
        // [3.3.6] > s.encoding
        // => #<Encoding:ASCII-8BIT>
        // [3.3.6] > s << "❤️"
        // => "abc❤️"
        // [3.3.6] > s.encoding == Encoding::UTF_8
        // ```
        let script = r#"
            s = String.new
            raise 'String.allocate.encoding is not binary' unless s.encoding == Encoding::BINARY
            s << "abc"
            raise 'String.allocate.encoding is not binary after appending ASCII' unless s.encoding == Encoding::BINARY
            s << "❤️"
            raise 'String.allocate.encoding is not UTF-8 after appending UTF-8' unless s.encoding == Encoding::UTF_8
        "#;
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }

    #[test]
    fn reinitializing_a_frozen_string_with_no_args_is_permitted() {
        let script = r"
            raise 'reinitializing empty frozen string failed' unless String.new.freeze.send(:initialize) == ''
            raise 'reinitializing non-empty frozen string failed' unless String.new('hello').freeze.send(:initialize) == 'hello'
        ";
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }

    #[test]
    fn reinitializing_a_frozen_string_with_args_raises_frozen_error() {
        let script = r"
            begin
                String.new.freeze.send(:initialize, 'world')
            rescue FrozenError
                # expected
            else
                raise 'reinitializing frozen empty string with args did not raise FrozenError'
            end

            begin
                String.new('hello').freeze.send(:initialize, 'world')
            rescue FrozenError
                # expected
            else
                raise 'reinitializing frozen non-empty string with args did not raise FrozenError'
            end
        ";
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }

    #[test]
    fn reinitializing_a_string_with_no_args_is_a_noop() {
        let script = "
            s = String.new
            s.send(:initialize)
            raise 'reinitializing empty string failed' unless s == ''

            s = String.new('hello')
            s.send(:initialize)
            raise 'reinitializing non-empty string failed' unless s == 'hello'
        ";
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }

    #[test]
    fn reinitializing_a_string_with_args_replaces_the_string_contents() {
        let script = "
            s = String.new
            s.send(:initialize, 'world')
            raise 'reinitializing empty string with args failed' unless s == 'world'

            s = String.new('hello')
            s.send(:initialize, 'world')
            raise 'reinitializing non-empty string with args failed' unless s == 'world'
        ";
        let mut interp = interpreter();
        eval_ok(&mut interp, script.as_bytes());
    }
}
377}