|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| #define PY_SSIZE_T_CLEAN |
| #include <Python.h> |
| #include <vector> |
| #include <iostream> |
| #include <cstring> |
|
|
| |
| #if defined(__x86_64__) || defined(_M_X64) |
| #ifdef _MSC_VER |
| #include <intrin.h> |
| #else |
| #include <cpuid.h> |
| #endif |
| #include <immintrin.h> |
| #define USE_AVX2 1 |
| #else |
| #define USE_AVX2 0 |
| #endif |
|
|
| |
| static bool supports_avx2() { |
| #if USE_AVX2 |
| #ifdef _MSC_VER |
| int cpu_info[4]; |
| __cpuid(cpu_info, 7); |
| return (cpu_info[1] & 0x20) != 0; |
| #else |
| unsigned int eax, ebx, ecx, edx; |
| if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { |
| return (ebx & (1 << 5)) != 0; |
| } |
| return false; |
| #endif |
| #else |
| return false; |
| #endif |
| } |
|
|
| |
| struct DATContext { |
| const int32_t* base; |
| const int32_t* check; |
| const int32_t* values; |
| uint32_t size; |
| PyObject* buffer_ref; |
| }; |
|
|
| static DATContext ctx; |
|
|
| |
| static void get_cpu_brand(char* brand) { |
| brand[0] = '\0'; |
| #ifdef _MSC_VER |
| int regs[4]; |
| __cpuid(regs, 0x80000000); |
| if (regs[0] >= 0x80000004) { |
| __cpuid((int*)(brand), 0x80000002); |
| __cpuid((int*)(brand+16), 0x80000003); |
| __cpuid((int*)(brand+32), 0x80000004); |
| } |
| #else |
| unsigned int eax, ebx, ecx, edx; |
| if (__get_cpuid_max(0x80000000, NULL) >= 0x80000004) { |
| __get_cpuid(0x80000002, &eax, &ebx, &ecx, &edx); |
| memcpy(brand, &eax, 4); memcpy(brand+4, &ebx, 4); memcpy(brand+8, &ecx, 4); memcpy(brand+12, &edx, 4); |
| __get_cpuid(0x80000003, &eax, &ebx, &ecx, &edx); |
| memcpy(brand+16, &eax, 4); memcpy(brand+20, &ebx, 4); memcpy(brand+24, &ecx, 4); memcpy(brand+28, &edx, 4); |
| __get_cpuid(0x80000004, &eax, &ebx, &ecx, &edx); |
| memcpy(brand+32, &eax, 4); memcpy(brand+36, &ebx, 4); memcpy(brand+40, &ecx, 4); memcpy(brand+44, &edx, 4); |
| } |
| #endif |
| } |
|
|
| static PyObject* get_hardware_info(PyObject* self, PyObject* args) { |
| char brand[49] = {0}; |
| get_cpu_brand(brand); |
| |
| |
| std::string cpu_name = brand; |
| size_t last = cpu_name.find_last_not_of(' '); |
| if (last != std::string::npos) cpu_name = cpu_name.substr(0, last + 1); |
| if (cpu_name.empty()) cpu_name = "Unknown CPU"; |
|
|
| std::string features = "Standard"; |
| if (supports_avx2()) { |
| features = "AVX2"; |
| } |
| #if defined(__AVX512F__) |
| features = "AVX-512 (Nitro)"; |
| #endif |
|
|
| std::string info = cpu_name + " [" + features + "]"; |
| return PyUnicode_FromString(info.c_str()); |
| } |
|
|
| |
| |
| inline int is_ascii_32_avx2(const char* ptr) { |
| #if USE_AVX2 |
| |
| __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); |
| |
| int mask = _mm256_movemask_epi8(chunk); |
| return mask == 0; |
| #else |
| return 0; |
| #endif |
| } |
|
|
| |
| static PyObject* tokenize(PyObject* self, PyObject* args) { |
| const char* text; |
| Py_ssize_t len; |
|
|
| |
| if (!PyArg_ParseTuple(args, "s#", &text, &len)) return NULL; |
|
|
| if (ctx.size == 0) { |
| PyErr_SetString(PyExc_RuntimeError, "Engine not loaded. Call load_dat() first."); |
| return NULL; |
| } |
|
|
| PyObject* result = PyList_New(0); |
| size_t pos = 0; |
|
|
| |
| while (pos < len) { |
| int32_t node = 0; |
| int best_token = -1; |
| int best_len = 0; |
| |
| |
| static bool avx2_supported = supports_avx2(); |
| |
| |
| bool fast_mode = false; |
| if (USE_AVX2 && avx2_supported && (len - pos) >= 32) { |
| if (is_ascii_32_avx2(text + pos)) { |
| fast_mode = true; |
| } |
| } |
|
|
| if (fast_mode) { |
| |
| |
| #pragma unroll |
| for (size_t i = pos; i < len; ++i) { |
| uint8_t c = (uint8_t)text[i]; |
| |
| |
| int32_t next = ctx.base[node] + c; |
|
|
| |
| if (next >= (int32_t)ctx.size || ctx.check[next] != node) { |
| break; |
| } |
|
|
| node = next; |
| |
| |
| int32_t val = ctx.values[node]; |
| if (val != -1) { |
| best_token = val; |
| best_len = (int)(i - pos) + 1; |
| } |
| } |
| } else { |
| |
| for (size_t i = pos; i < len; ++i) { |
| uint8_t c = (uint8_t)text[i]; |
| |
| int32_t next = ctx.base[node] + c; |
|
|
| if (next >= (int32_t)ctx.size || ctx.check[next] != node) { |
| break; |
| } |
|
|
| node = next; |
| int32_t val = ctx.values[node]; |
| if (val != -1) { |
| best_token = val; |
| best_len = (int)(i - pos) + 1; |
| } |
| } |
| } |
|
|
| |
| if (best_len > 0) { |
| PyObject* val = PyLong_FromLong(best_token); |
| PyList_Append(result, val); |
| Py_DECREF(val); |
| pos += best_len; |
| } else { |
| |
| |
| PyObject* unk = PyLong_FromLong(1); |
| PyList_Append(result, unk); |
| Py_DECREF(unk); |
| pos++; |
| } |
| } |
|
|
| return result; |
| } |
|
|
| |
| static Py_buffer ctx_buffer; |
| static bool buffer_held = false; |
|
|
| |
| |
| static PyObject* load_dat(PyObject* self, PyObject* args) { |
| PyObject* py_buffer_obj; |
| if (!PyArg_ParseTuple(args, "O", &py_buffer_obj)) return NULL; |
| |
| |
| if (buffer_held) { |
| PyBuffer_Release(&ctx_buffer); |
| buffer_held = false; |
| } |
| if (ctx.buffer_ref) { |
| Py_XDECREF(ctx.buffer_ref); |
| ctx.buffer_ref = NULL; |
| } |
|
|
| |
| if (PyObject_GetBuffer(py_buffer_obj, &ctx_buffer, PyBUF_SIMPLE) != 0) { |
| PyErr_SetString(PyExc_TypeError, "Expected buffer-like object (bytes, mmap, memoryview)"); |
| return NULL; |
| } |
| buffer_held = true; |
|
|
| |
| Py_XINCREF(py_buffer_obj); |
| ctx.buffer_ref = py_buffer_obj; |
|
|
| char* raw_ptr = static_cast<char*>(ctx_buffer.buf); |
| Py_ssize_t buf_len = ctx_buffer.len; |
| |
| |
| if (buf_len < 12) { |
| PyErr_SetString(PyExc_ValueError, "Buffer too small for DAT header"); |
| return NULL; |
| } |
| |
| |
| if (strncmp(raw_ptr, "CRAY", 4) != 0) { |
| PyErr_SetString(PyExc_ValueError, "Invalid Magic Header"); |
| return NULL; |
| } |
|
|
| |
| ctx.size = *reinterpret_cast<uint32_t*>(raw_ptr + 8); |
| |
| |
| size_t expected_size = 12 + (3 * ctx.size * sizeof(int32_t)); |
| if (static_cast<size_t>(buf_len) < expected_size) { |
| PyErr_SetString(PyExc_ValueError, "Buffer size mismatch with header"); |
| return NULL; |
| } |
|
|
| |
| char* arrays_ptr = raw_ptr + 12; |
| size_t array_bytes = ctx.size * sizeof(int32_t); |
|
|
| ctx.base = reinterpret_cast<int32_t*>(arrays_ptr); |
| ctx.check = reinterpret_cast<int32_t*>(arrays_ptr + array_bytes); |
| ctx.values = reinterpret_cast<int32_t*>(arrays_ptr + (2 * array_bytes)); |
|
|
| return PyLong_FromLong(ctx.size); |
| } |
|
|
| |
| static PyMethodDef Methods[] = { |
| {"tokenize", tokenize, METH_VARARGS, "Fast DAT Tokenize"}, |
| {"load_dat", load_dat, METH_VARARGS, "Load Memory Map"}, |
| {"get_hardware_info", get_hardware_info, METH_VARARGS, "Get CPU Telemetry"}, |
| {NULL, NULL, 0, NULL} |
| }; |
|
|
| static struct PyModuleDef module = { |
| PyModuleDef_HEAD_INIT, "crayon_cpu", "Crayon AVX2 Backend", -1, Methods |
| }; |
|
|
| PyMODINIT_FUNC PyInit_crayon_cpu(void) { |
| return PyModule_Create(&module); |
| } |
|
|