Skip to content

Commit 48c0655

Browse files
decathorpe authored and davidhewitt committed
Add wrapper for PyASCIIObject state bitfield accesses based on bindgen
1 parent ce288e6 commit 48c0655

File tree

5 files changed

+268
-45
lines changed

5 files changed

+268
-45
lines changed

newsfragments/3015.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Support `PyASCIIObject` / `PyUnicode` and associated methods on big-endian architectures.

pyo3-ffi/src/cpython/unicodeobject.rs

Lines changed: 251 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,184 @@ use std::os::raw::{c_char, c_int, c_uint, c_void};
3030
// skipped Py_UNICODE_HIGH_SURROGATE
3131
// skipped Py_UNICODE_LOW_SURROGATE
3232

33+
// generated by bindgen v0.63.0 (with small adaptations)

/// Byte storage for one C bitfield allocation unit.
///
/// Bits are addressed by a logical index counted from the start of the storage.
/// How a logical index maps onto a physical bit inside its byte depends on the
/// target endianness (see `bit_mask`).
#[repr(C)]
struct BitfieldUnit<Storage> {
    storage: Storage,
}

impl<Storage> BitfieldUnit<Storage> {
    /// Wraps `storage` as-is.
    #[inline]
    pub const fn new(storage: Storage) -> Self {
        Self { storage }
    }
}

impl<Storage> BitfieldUnit<Storage>
where
    Storage: AsRef<[u8]> + AsMut<[u8]>,
{
    /// Returns the mask that selects logical bit `index` inside its byte.
    ///
    /// On big-endian targets the bit order within a byte is mirrored.
    #[inline]
    fn bit_mask(index: usize) -> u8 {
        let bit_index = if cfg!(target_endian = "big") {
            7 - (index % 8)
        } else {
            index % 8
        };
        1 << bit_index
    }

    /// Reads the single bit at logical position `index`.
    #[inline]
    fn get_bit(&self, index: usize) -> bool {
        debug_assert!(index / 8 < self.storage.as_ref().len());
        let byte = self.storage.as_ref()[index / 8];
        byte & Self::bit_mask(index) != 0
    }

    /// Sets (`val == true`) or clears the single bit at logical position `index`.
    #[inline]
    fn set_bit(&mut self, index: usize, val: bool) {
        debug_assert!(index / 8 < self.storage.as_ref().len());
        let mask = Self::bit_mask(index);
        let byte = &mut self.storage.as_mut()[index / 8];
        if val {
            *byte |= mask;
        } else {
            *byte &= !mask;
        }
    }

    /// Reads the `bit_width`-bit field starting at logical bit `bit_offset`.
    ///
    /// The field must lie entirely inside the storage; this is checked with
    /// debug assertions, and indexing panics on overhang in every build.
    #[inline]
    fn get(&self, bit_offset: usize, bit_width: u8) -> u64 {
        let width = usize::from(bit_width);
        debug_assert!(bit_width <= 64);
        debug_assert!(bit_offset / 8 < self.storage.as_ref().len());
        // Compare in bits, not whole bytes: rounding down would accept a field
        // that overhangs the final byte.
        debug_assert!(bit_offset + width <= self.storage.as_ref().len() * 8);
        let mut val = 0u64;
        for i in 0..width {
            if self.get_bit(i + bit_offset) {
                // Big-endian fields store their most significant bit first.
                let index = if cfg!(target_endian = "big") {
                    width - 1 - i
                } else {
                    i
                };
                val |= 1u64 << index;
            }
        }
        val
    }

    /// Writes the low `bit_width` bits of `val` into the field starting at
    /// logical bit `bit_offset`; higher bits of `val` are ignored.
    ///
    /// The field must lie entirely inside the storage. In debug builds this is
    /// asserted before any bit is written, so a rejected call leaves the
    /// storage untouched.
    #[inline]
    fn set(&mut self, bit_offset: usize, bit_width: u8, val: u64) {
        let width = usize::from(bit_width);
        debug_assert!(bit_width <= 64);
        debug_assert!(bit_offset / 8 < self.storage.as_ref().len());
        // Compare in bits, not whole bytes (see `get`).
        debug_assert!(bit_offset + width <= self.storage.as_ref().len() * 8);
        for i in 0..width {
            let val_bit_is_set = val & (1u64 << i) != 0;
            // Big-endian fields store their most significant bit first.
            let index = if cfg!(target_endian = "big") {
                width - 1 - i
            } else {
                i
            };
            self.set_bit(index + bit_offset, val_bit_is_set);
        }
    }
}
118+
119+
// generated by bindgen v0.63.0 (with small adaptations)
120+
// The same code is generated for Python 3.7, 3.8, 3.9, 3.10, and 3.11, but the "ready" field
121+
// has been removed from Python 3.12.
122+
123+
/// Wrapper around the `PyASCIIObject.state` bitfield with getters and setters that work
124+
/// on most little- and big-endian architectures.
125+
///
126+
/// Memory layout of C bitfields is implementation defined, so these functions are still
127+
/// unsafe. Users must verify that they work as expected on the architectures they target.
128+
#[repr(C)]
129+
#[repr(align(4))]
130+
struct PyASCIIObjectState {
131+
_bitfield_align: [u8; 0],
132+
_bitfield: BitfieldUnit<[u8; 4usize]>,
133+
}
134+
135+
// c_uint and u32 are not necessarily the same type on all targets / architectures
136+
#[allow(clippy::useless_transmute)]
137+
impl PyASCIIObjectState {
138+
#[inline]
139+
unsafe fn interned(&self) -> c_uint {
140+
std::mem::transmute(self._bitfield.get(0usize, 2u8) as u32)
141+
}
142+
143+
#[inline]
144+
unsafe fn set_interned(&mut self, val: c_uint) {
145+
let val: u32 = std::mem::transmute(val);
146+
self._bitfield.set(0usize, 2u8, val as u64)
147+
}
148+
149+
#[inline]
150+
unsafe fn kind(&self) -> c_uint {
151+
std::mem::transmute(self._bitfield.get(2usize, 3u8) as u32)
152+
}
153+
154+
#[inline]
155+
unsafe fn set_kind(&mut self, val: c_uint) {
156+
let val: u32 = std::mem::transmute(val);
157+
self._bitfield.set(2usize, 3u8, val as u64)
158+
}
159+
160+
#[inline]
161+
unsafe fn compact(&self) -> c_uint {
162+
std::mem::transmute(self._bitfield.get(5usize, 1u8) as u32)
163+
}
164+
165+
#[inline]
166+
unsafe fn set_compact(&mut self, val: c_uint) {
167+
let val: u32 = std::mem::transmute(val);
168+
self._bitfield.set(5usize, 1u8, val as u64)
169+
}
170+
171+
#[inline]
172+
unsafe fn ascii(&self) -> c_uint {
173+
std::mem::transmute(self._bitfield.get(6usize, 1u8) as u32)
174+
}
175+
176+
#[inline]
177+
unsafe fn set_ascii(&mut self, val: c_uint) {
178+
let val: u32 = std::mem::transmute(val);
179+
self._bitfield.set(6usize, 1u8, val as u64)
180+
}
181+
182+
#[inline]
183+
unsafe fn ready(&self) -> c_uint {
184+
std::mem::transmute(self._bitfield.get(7usize, 1u8) as u32)
185+
}
186+
187+
#[inline]
188+
unsafe fn set_ready(&mut self, val: c_uint) {
189+
let val: u32 = std::mem::transmute(val);
190+
self._bitfield.set(7usize, 1u8, val as u64)
191+
}
192+
}
193+
194+
impl From<u32> for PyASCIIObjectState {
195+
#[inline]
196+
fn from(value: u32) -> Self {
197+
PyASCIIObjectState {
198+
_bitfield_align: [],
199+
_bitfield: BitfieldUnit::new(value.to_ne_bytes()),
200+
}
201+
}
202+
}
203+
204+
impl From<PyASCIIObjectState> for u32 {
205+
#[inline]
206+
fn from(value: PyASCIIObjectState) -> Self {
207+
u32::from_ne_bytes(value._bitfield.storage)
208+
}
209+
}
210+
33211
#[repr(C)]
34212
pub struct PyASCIIObject {
35213
pub ob_base: PyObject,
@@ -52,34 +230,98 @@ pub struct PyASCIIObject {
52230
}
53231

54232
/// Interacting with the bitfield is not actually well-defined, so we mark these APIs unsafe.
55-
///
56-
/// In addition, they are disabled on big-endian architectures to restrict this to most "common"
57-
/// platforms, which are at least tested on CI and appear to be sound.
58-
#[cfg(target_endian = "little")]
59233
impl PyASCIIObject {
234+
/// Get the `interned` field of the [`PyASCIIObject`] state bitfield.
235+
///
236+
/// Returns one of: [`SSTATE_NOT_INTERNED`], [`SSTATE_INTERNED_MORTAL`], [`SSTATE_INTERNED_IMMORTAL`]
60237
#[inline]
61238
pub unsafe fn interned(&self) -> c_uint {
62-
self.state & 3
239+
PyASCIIObjectState::from(self.state).interned()
63240
}
64241

242+
/// Set the `interned` field of the [`PyASCIIObject`] state bitfield.
243+
///
244+
/// Calling this function with an argument that is not [`SSTATE_NOT_INTERNED`],
245+
/// [`SSTATE_INTERNED_MORTAL`], or [`SSTATE_INTERNED_IMMORTAL`] is invalid.
246+
#[inline]
247+
pub unsafe fn set_interned(&mut self, val: c_uint) {
248+
let mut state = PyASCIIObjectState::from(self.state);
249+
state.set_interned(val);
250+
self.state = u32::from(state);
251+
}
252+
253+
/// Get the `kind` field of the [`PyASCIIObject`] state bitfield.
254+
///
255+
/// Returns one of: [`PyUnicode_WCHAR_KIND`], [`PyUnicode_1BYTE_KIND`], [`PyUnicode_2BYTE_KIND`],
256+
/// [`PyUnicode_4BYTE_KIND`]
65257
#[inline]
66258
pub unsafe fn kind(&self) -> c_uint {
67-
(self.state >> 2) & 7
259+
PyASCIIObjectState::from(self.state).kind()
68260
}
69261

262+
/// Set the `kind` field of the [`PyASCIIObject`] state bitfield.
263+
///
264+
/// Calling this function with an argument that is not [`PyUnicode_WCHAR_KIND`], [`PyUnicode_1BYTE_KIND`],
265+
/// [`PyUnicode_2BYTE_KIND`], or [`PyUnicode_4BYTE_KIND`] is invalid.
266+
#[inline]
267+
pub unsafe fn set_kind(&mut self, val: c_uint) {
268+
let mut state = PyASCIIObjectState::from(self.state);
269+
state.set_kind(val);
270+
self.state = u32::from(state);
271+
}
272+
273+
/// Get the `compact` field of the [`PyASCIIObject`] state bitfield.
274+
///
275+
/// Returns either `0` or `1`.
70276
#[inline]
71277
pub unsafe fn compact(&self) -> c_uint {
72-
(self.state >> 5) & 1
278+
PyASCIIObjectState::from(self.state).compact()
279+
}
280+
281+
/// Set the `compact` flag of the [`PyASCIIObject`] state bitfield.
282+
///
283+
/// Calling this function with an argument that is neither `0` nor `1` is invalid.
284+
#[inline]
285+
pub unsafe fn set_compact(&mut self, val: c_uint) {
286+
let mut state = PyASCIIObjectState::from(self.state);
287+
state.set_compact(val);
288+
self.state = u32::from(state);
73289
}
74290

291+
/// Get the `ascii` field of the [`PyASCIIObject`] state bitfield.
292+
///
293+
/// Returns either `0` or `1`.
75294
#[inline]
76295
pub unsafe fn ascii(&self) -> c_uint {
77-
(self.state >> 6) & 1
296+
PyASCIIObjectState::from(self.state).ascii()
78297
}
79298

299+
/// Set the `ascii` flag of the [`PyASCIIObject`] state bitfield.
300+
///
301+
/// Calling this function with an argument that is neither `0` nor `1` is invalid.
302+
#[inline]
303+
pub unsafe fn set_ascii(&mut self, val: c_uint) {
304+
let mut state = PyASCIIObjectState::from(self.state);
305+
state.set_ascii(val);
306+
self.state = u32::from(state);
307+
}
308+
309+
/// Get the `ready` field of the [`PyASCIIObject`] state bitfield.
310+
///
311+
/// Returns either `0` or `1`.
80312
#[inline]
81313
pub unsafe fn ready(&self) -> c_uint {
82-
(self.state >> 7) & 1
314+
PyASCIIObjectState::from(self.state).ready()
315+
}
316+
317+
/// Set the `ready` flag of the [`PyASCIIObject`] state bitfield.
318+
///
319+
/// Calling this function with an argument that is neither `0` nor `1` is invalid.
320+
#[inline]
321+
pub unsafe fn set_ready(&mut self, val: c_uint) {
322+
let mut state = PyASCIIObjectState::from(self.state);
323+
state.set_ready(val);
324+
self.state = u32::from(state);
83325
}
84326
}
85327

@@ -120,7 +362,6 @@ pub const SSTATE_INTERNED_MORTAL: c_uint = 1;
120362
pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2;
121363

122364
#[inline]
123-
#[cfg(target_endian = "little")]
124365
pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
125366
debug_assert!(crate::PyUnicode_Check(op) != 0);
126367
debug_assert!(PyUnicode_IS_READY(op) != 0);
@@ -129,13 +370,11 @@ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
129370
}
130371

131372
#[inline]
132-
#[cfg(target_endian = "little")]
133373
pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint {
134374
(*(op as *mut PyASCIIObject)).compact()
135375
}
136376

137377
#[inline]
138-
#[cfg(target_endian = "little")]
139378
pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint {
140379
((*(op as *mut PyASCIIObject)).ascii() != 0 && PyUnicode_IS_COMPACT(op) != 0).into()
141380
}
@@ -149,25 +388,21 @@ pub const PyUnicode_2BYTE_KIND: c_uint = 2;
149388
pub const PyUnicode_4BYTE_KIND: c_uint = 4;
150389

151390
#[inline]
152-
#[cfg(target_endian = "little")]
153391
pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 {
154392
PyUnicode_DATA(op) as *mut Py_UCS1
155393
}
156394

157395
#[inline]
158-
#[cfg(target_endian = "little")]
159396
pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 {
160397
PyUnicode_DATA(op) as *mut Py_UCS2
161398
}
162399

163400
#[inline]
164-
#[cfg(target_endian = "little")]
165401
pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 {
166402
PyUnicode_DATA(op) as *mut Py_UCS4
167403
}
168404

169405
#[inline]
170-
#[cfg(target_endian = "little")]
171406
pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
172407
debug_assert!(crate::PyUnicode_Check(op) != 0);
173408
debug_assert!(PyUnicode_IS_READY(op) != 0);
@@ -176,7 +411,6 @@ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
176411
}
177412

178413
#[inline]
179-
#[cfg(target_endian = "little")]
180414
pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
181415
if PyUnicode_IS_ASCII(op) != 0 {
182416
(op as *mut PyASCIIObject).offset(1) as *mut c_void
@@ -186,15 +420,13 @@ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
186420
}
187421

188422
#[inline]
189-
#[cfg(target_endian = "little")]
190423
pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void {
191424
debug_assert!(!(*(op as *mut PyUnicodeObject)).data.any.is_null());
192425

193426
(*(op as *mut PyUnicodeObject)).data.any
194427
}
195428

196429
#[inline]
197-
#[cfg(target_endian = "little")]
198430
pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
199431
debug_assert!(crate::PyUnicode_Check(op) != 0);
200432

@@ -210,7 +442,6 @@ pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
210442
// skipped PyUnicode_READ_CHAR
211443

212444
#[inline]
213-
#[cfg(target_endian = "little")]
214445
pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
215446
debug_assert!(crate::PyUnicode_Check(op) != 0);
216447
debug_assert!(PyUnicode_IS_READY(op) != 0);
@@ -219,15 +450,13 @@ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
219450
}
220451

221452
#[inline]
222-
#[cfg(target_endian = "little")]
223453
pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint {
224454
(*(op as *mut PyASCIIObject)).ready()
225455
}
226456

227457
#[cfg(not(Py_3_12))]
228458
#[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))]
229459
#[inline]
230-
#[cfg(target_endian = "little")]
231460
pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int {
232461
debug_assert!(crate::PyUnicode_Check(op) != 0);
233462

0 commit comments

Comments
 (0)