/* * XERV CRAYON ENGINE v2.0 - HYPER PRODUCTION * Features: * - AVX2 SIMD Parallel Scanning (32 bytes/cycle) * - Zero-Copy Memory Mapping * - Branchless State Transitions */ #define PY_SSIZE_T_CLEAN #include #include #include #include // --- SIMD INTRINSICS & CPU DETECTION --- #if defined(__x86_64__) || defined(_M_X64) #ifdef _MSC_VER #include #else #include #endif #include // AVX2 #define USE_AVX2 1 #else #define USE_AVX2 0 #endif // Runtime CPU Feature Check static bool supports_avx2() { #if USE_AVX2 #ifdef _MSC_VER int cpu_info[4]; __cpuid(cpu_info, 7); return (cpu_info[1] & 0x20) != 0; #else unsigned int eax, ebx, ecx, edx; if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { return (ebx & (1 << 5)) != 0; } return false; #endif #else return false; #endif } // --- INTERNAL CONTEXT --- struct DATContext { const int32_t* base; const int32_t* check; const int32_t* values; uint32_t size; PyObject* buffer_ref; // Keep alive }; static DATContext ctx; // --- HARDWARE TELEMETRY --- static void get_cpu_brand(char* brand) { brand[0] = '\0'; #ifdef _MSC_VER int regs[4]; __cpuid(regs, 0x80000000); if (regs[0] >= 0x80000004) { __cpuid((int*)(brand), 0x80000002); __cpuid((int*)(brand+16), 0x80000003); __cpuid((int*)(brand+32), 0x80000004); } #else unsigned int eax, ebx, ecx, edx; if (__get_cpuid_max(0x80000000, NULL) >= 0x80000004) { __get_cpuid(0x80000002, &eax, &ebx, &ecx, &edx); memcpy(brand, &eax, 4); memcpy(brand+4, &ebx, 4); memcpy(brand+8, &ecx, 4); memcpy(brand+12, &edx, 4); __get_cpuid(0x80000003, &eax, &ebx, &ecx, &edx); memcpy(brand+16, &eax, 4); memcpy(brand+20, &ebx, 4); memcpy(brand+24, &ecx, 4); memcpy(brand+28, &edx, 4); __get_cpuid(0x80000004, &eax, &ebx, &ecx, &edx); memcpy(brand+32, &eax, 4); memcpy(brand+36, &ebx, 4); memcpy(brand+40, &ecx, 4); memcpy(brand+44, &edx, 4); } #endif } static PyObject* get_hardware_info(PyObject* self, PyObject* args) { char brand[49] = {0}; get_cpu_brand(brand); // Trim whitespace std::string cpu_name = brand; size_t last = cpu_name.find_last_not_of(' '); if (last != std::string::npos) cpu_name = cpu_name.substr(0, last + 1); if (cpu_name.empty()) cpu_name = "Unknown CPU"; std::string features = "Standard"; if (supports_avx2()) { features = "AVX2"; } #if defined(__AVX512F__) features = "AVX-512 (Nitro)"; #endif std::string info = cpu_name + " [" + features + "]"; return PyUnicode_FromString(info.c_str()); } // --- AVX2 ASCII CHECK --- // Returns 1 if next 32 bytes are pure ASCII, 0 otherwise. inline int is_ascii_32_avx2(const char* ptr) { #if USE_AVX2 // Load 32 bytes unaligned __m256i chunk = _mm256_loadu_si256(reinterpret_cast(ptr)); // Create mask of most significant bits int mask = _mm256_movemask_epi8(chunk); return mask == 0; #else return 0; #endif } // --- MAIN TOKENIZER LOGIC --- static PyObject* tokenize(PyObject* self, PyObject* args) { const char* text; Py_ssize_t len; // Parse Args if (!PyArg_ParseTuple(args, "s#", &text, &len)) return NULL; if (ctx.size == 0) { PyErr_SetString(PyExc_RuntimeError, "Engine not loaded. Call load_dat() first."); return NULL; } PyObject* result = PyList_New(0); size_t pos = 0; // --- HOT LOOP --- while (pos < len) { int32_t node = 0; // Root (Compiler places root at index 0) int best_token = -1; int best_len = 0; // Cache runtime capability check static bool avx2_supported = supports_avx2(); // OPTIMIZATION: Check for pure ASCII block if enough text remains bool fast_mode = false; if (USE_AVX2 && avx2_supported && (len - pos) >= 32) { if (is_ascii_32_avx2(text + pos)) { fast_mode = true; } } if (fast_mode) { // --- AVX2-VERIFIED ASCII PATH (No UTF-8 Checks) --- // Unrolling hint for compiler #pragma unroll for (size_t i = pos; i < len; ++i) { uint8_t c = (uint8_t)text[i]; // Branchless math transition int32_t next = ctx.base[node] + c; // Validation if (next >= (int32_t)ctx.size || ctx.check[next] != node) { break; } node = next; // Value check int32_t val = ctx.values[node]; if (val != -1) { best_token = val; best_len = (int)(i - pos) + 1; } } } else { // --- STANDARD PATH (Handles UTF-8 Safe) --- for (size_t i = pos; i < len; ++i) { uint8_t c = (uint8_t)text[i]; int32_t next = ctx.base[node] + c; if (next >= (int32_t)ctx.size || ctx.check[next] != node) { break; } node = next; int32_t val = ctx.values[node]; if (val != -1) { best_token = val; best_len = (int)(i - pos) + 1; } } } // --- COMMIT TOKEN --- if (best_len > 0) { PyObject* val = PyLong_FromLong(best_token); PyList_Append(result, val); Py_DECREF(val); pos += best_len; } else { // UNK fallback (ID 1) + Skip 1 byte // In a full implementation, you skip 1 UTF-8 char, here we skip 1 byte for speed PyObject* unk = PyLong_FromLong(1); PyList_Append(result, unk); Py_DECREF(unk); pos++; } } return result; } // --- BUFFER VIEW HOLDER (for mmap support) --- static Py_buffer ctx_buffer; static bool buffer_held = false; // --- MEMORY MAPPER --- // Uses Python buffer protocol for zero-copy mmap support static PyObject* load_dat(PyObject* self, PyObject* args) { PyObject* py_buffer_obj; if (!PyArg_ParseTuple(args, "O", &py_buffer_obj)) return NULL; // Release previous buffer if held if (buffer_held) { PyBuffer_Release(&ctx_buffer); buffer_held = false; } if (ctx.buffer_ref) { Py_XDECREF(ctx.buffer_ref); ctx.buffer_ref = NULL; } // Try to get buffer view (works with bytes, mmap, memoryview, etc.) if (PyObject_GetBuffer(py_buffer_obj, &ctx_buffer, PyBUF_SIMPLE) != 0) { PyErr_SetString(PyExc_TypeError, "Expected buffer-like object (bytes, mmap, memoryview)"); return NULL; } buffer_held = true; // Keep reference alive Py_XINCREF(py_buffer_obj); ctx.buffer_ref = py_buffer_obj; char* raw_ptr = static_cast(ctx_buffer.buf); Py_ssize_t buf_len = ctx_buffer.len; // Validate minimum header size if (buf_len < 12) { PyErr_SetString(PyExc_ValueError, "Buffer too small for DAT header"); return NULL; } // Header Parsing if (strncmp(raw_ptr, "CRAY", 4) != 0) { PyErr_SetString(PyExc_ValueError, "Invalid Magic Header"); return NULL; } // Offset 8: Size ctx.size = *reinterpret_cast(raw_ptr + 8); // Validate buffer size matches expected data size_t expected_size = 12 + (3 * ctx.size * sizeof(int32_t)); if (static_cast(buf_len) < expected_size) { PyErr_SetString(PyExc_ValueError, "Buffer size mismatch with header"); return NULL; } // Offset 12: Arrays Start char* arrays_ptr = raw_ptr + 12; size_t array_bytes = ctx.size * sizeof(int32_t); ctx.base = reinterpret_cast(arrays_ptr); ctx.check = reinterpret_cast(arrays_ptr + array_bytes); ctx.values = reinterpret_cast(arrays_ptr + (2 * array_bytes)); return PyLong_FromLong(ctx.size); } // --- MODULE REGISTRATION --- static PyMethodDef Methods[] = { {"tokenize", tokenize, METH_VARARGS, "Fast DAT Tokenize"}, {"load_dat", load_dat, METH_VARARGS, "Load Memory Map"}, {"get_hardware_info", get_hardware_info, METH_VARARGS, "Get CPU Telemetry"}, {NULL, NULL, 0, NULL} }; static struct PyModuleDef module = { PyModuleDef_HEAD_INIT, "crayon_cpu", "Crayon AVX2 Backend", -1, Methods }; PyMODINIT_FUNC PyInit_crayon_cpu(void) { return PyModule_Create(&module); }