CRAYON-tokenizer / src /crayon /c_ext /cpu_engine.cpp
Phase-Technologies's picture
Upload folder using huggingface_hub
708f4a3 verified
/*
* XERV CRAYON ENGINE v2.0 - HYPER PRODUCTION
* Features:
* - AVX2 SIMD Parallel Scanning (32 bytes/cycle)
* - Zero-Copy Memory Mapping
* - Branchless State Transitions
*/
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <vector>
#include <iostream>
#include <cstring>
// --- SIMD INTRINSICS & CPU DETECTION ---
#if defined(__x86_64__) || defined(_M_X64)
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <cpuid.h>
#endif
#include <immintrin.h> // AVX2
#define USE_AVX2 1
#else
#define USE_AVX2 0
#endif
// Runtime CPU Feature Check
static bool supports_avx2() {
#if USE_AVX2
#ifdef _MSC_VER
int cpu_info[4];
__cpuid(cpu_info, 7);
return (cpu_info[1] & 0x20) != 0;
#else
unsigned int eax, ebx, ecx, edx;
if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) {
return (ebx & (1 << 5)) != 0;
}
return false;
#endif
#else
return false;
#endif
}
// --- INTERNAL CONTEXT ---
struct DATContext {
const int32_t* base;
const int32_t* check;
const int32_t* values;
uint32_t size;
PyObject* buffer_ref; // Keep alive
};
static DATContext ctx;
// --- HARDWARE TELEMETRY ---
static void get_cpu_brand(char* brand) {
brand[0] = '\0';
#ifdef _MSC_VER
int regs[4];
__cpuid(regs, 0x80000000);
if (regs[0] >= 0x80000004) {
__cpuid((int*)(brand), 0x80000002);
__cpuid((int*)(brand+16), 0x80000003);
__cpuid((int*)(brand+32), 0x80000004);
}
#else
unsigned int eax, ebx, ecx, edx;
if (__get_cpuid_max(0x80000000, NULL) >= 0x80000004) {
__get_cpuid(0x80000002, &eax, &ebx, &ecx, &edx);
memcpy(brand, &eax, 4); memcpy(brand+4, &ebx, 4); memcpy(brand+8, &ecx, 4); memcpy(brand+12, &edx, 4);
__get_cpuid(0x80000003, &eax, &ebx, &ecx, &edx);
memcpy(brand+16, &eax, 4); memcpy(brand+20, &ebx, 4); memcpy(brand+24, &ecx, 4); memcpy(brand+28, &edx, 4);
__get_cpuid(0x80000004, &eax, &ebx, &ecx, &edx);
memcpy(brand+32, &eax, 4); memcpy(brand+36, &ebx, 4); memcpy(brand+40, &ecx, 4); memcpy(brand+44, &edx, 4);
}
#endif
}
static PyObject* get_hardware_info(PyObject* self, PyObject* args) {
char brand[49] = {0};
get_cpu_brand(brand);
// Trim whitespace
std::string cpu_name = brand;
size_t last = cpu_name.find_last_not_of(' ');
if (last != std::string::npos) cpu_name = cpu_name.substr(0, last + 1);
if (cpu_name.empty()) cpu_name = "Unknown CPU";
std::string features = "Standard";
if (supports_avx2()) {
features = "AVX2";
}
#if defined(__AVX512F__)
features = "AVX-512 (Nitro)";
#endif
std::string info = cpu_name + " [" + features + "]";
return PyUnicode_FromString(info.c_str());
}
// --- AVX2 ASCII CHECK ---
// Returns 1 if next 32 bytes are pure ASCII, 0 otherwise.
inline int is_ascii_32_avx2(const char* ptr) {
#if USE_AVX2
// Load 32 bytes unaligned
__m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
// Create mask of most significant bits
int mask = _mm256_movemask_epi8(chunk);
return mask == 0;
#else
return 0;
#endif
}
// --- MAIN TOKENIZER LOGIC ---
static PyObject* tokenize(PyObject* self, PyObject* args) {
const char* text;
Py_ssize_t len;
// Parse Args
if (!PyArg_ParseTuple(args, "s#", &text, &len)) return NULL;
if (ctx.size == 0) {
PyErr_SetString(PyExc_RuntimeError, "Engine not loaded. Call load_dat() first.");
return NULL;
}
PyObject* result = PyList_New(0);
size_t pos = 0;
// --- HOT LOOP ---
while (pos < len) {
int32_t node = 0; // Root (Compiler places root at index 0)
int best_token = -1;
int best_len = 0;
// Cache runtime capability check
static bool avx2_supported = supports_avx2();
// OPTIMIZATION: Check for pure ASCII block if enough text remains
bool fast_mode = false;
if (USE_AVX2 && avx2_supported && (len - pos) >= 32) {
if (is_ascii_32_avx2(text + pos)) {
fast_mode = true;
}
}
if (fast_mode) {
// --- AVX2-VERIFIED ASCII PATH (No UTF-8 Checks) ---
// Unrolling hint for compiler
#pragma unroll
for (size_t i = pos; i < len; ++i) {
uint8_t c = (uint8_t)text[i];
// Branchless math transition
int32_t next = ctx.base[node] + c;
// Validation
if (next >= (int32_t)ctx.size || ctx.check[next] != node) {
break;
}
node = next;
// Value check
int32_t val = ctx.values[node];
if (val != -1) {
best_token = val;
best_len = (int)(i - pos) + 1;
}
}
} else {
// --- STANDARD PATH (Handles UTF-8 Safe) ---
for (size_t i = pos; i < len; ++i) {
uint8_t c = (uint8_t)text[i];
int32_t next = ctx.base[node] + c;
if (next >= (int32_t)ctx.size || ctx.check[next] != node) {
break;
}
node = next;
int32_t val = ctx.values[node];
if (val != -1) {
best_token = val;
best_len = (int)(i - pos) + 1;
}
}
}
// --- COMMIT TOKEN ---
if (best_len > 0) {
PyObject* val = PyLong_FromLong(best_token);
PyList_Append(result, val);
Py_DECREF(val);
pos += best_len;
} else {
// UNK fallback (ID 1) + Skip 1 byte
// In a full implementation, you skip 1 UTF-8 char, here we skip 1 byte for speed
PyObject* unk = PyLong_FromLong(1);
PyList_Append(result, unk);
Py_DECREF(unk);
pos++;
}
}
return result;
}
// --- BUFFER VIEW HOLDER (for mmap support) ---
static Py_buffer ctx_buffer;
static bool buffer_held = false;
// --- MEMORY MAPPER ---
// Uses Python buffer protocol for zero-copy mmap support
static PyObject* load_dat(PyObject* self, PyObject* args) {
PyObject* py_buffer_obj;
if (!PyArg_ParseTuple(args, "O", &py_buffer_obj)) return NULL;
// Release previous buffer if held
if (buffer_held) {
PyBuffer_Release(&ctx_buffer);
buffer_held = false;
}
if (ctx.buffer_ref) {
Py_XDECREF(ctx.buffer_ref);
ctx.buffer_ref = NULL;
}
// Try to get buffer view (works with bytes, mmap, memoryview, etc.)
if (PyObject_GetBuffer(py_buffer_obj, &ctx_buffer, PyBUF_SIMPLE) != 0) {
PyErr_SetString(PyExc_TypeError, "Expected buffer-like object (bytes, mmap, memoryview)");
return NULL;
}
buffer_held = true;
// Keep reference alive
Py_XINCREF(py_buffer_obj);
ctx.buffer_ref = py_buffer_obj;
char* raw_ptr = static_cast<char*>(ctx_buffer.buf);
Py_ssize_t buf_len = ctx_buffer.len;
// Validate minimum header size
if (buf_len < 12) {
PyErr_SetString(PyExc_ValueError, "Buffer too small for DAT header");
return NULL;
}
// Header Parsing
if (strncmp(raw_ptr, "CRAY", 4) != 0) {
PyErr_SetString(PyExc_ValueError, "Invalid Magic Header");
return NULL;
}
// Offset 8: Size
ctx.size = *reinterpret_cast<uint32_t*>(raw_ptr + 8);
// Validate buffer size matches expected data
size_t expected_size = 12 + (3 * ctx.size * sizeof(int32_t));
if (static_cast<size_t>(buf_len) < expected_size) {
PyErr_SetString(PyExc_ValueError, "Buffer size mismatch with header");
return NULL;
}
// Offset 12: Arrays Start
char* arrays_ptr = raw_ptr + 12;
size_t array_bytes = ctx.size * sizeof(int32_t);
ctx.base = reinterpret_cast<int32_t*>(arrays_ptr);
ctx.check = reinterpret_cast<int32_t*>(arrays_ptr + array_bytes);
ctx.values = reinterpret_cast<int32_t*>(arrays_ptr + (2 * array_bytes));
return PyLong_FromLong(ctx.size);
}
// --- MODULE REGISTRATION ---
static PyMethodDef Methods[] = {
{"tokenize", tokenize, METH_VARARGS, "Fast DAT Tokenize"},
{"load_dat", load_dat, METH_VARARGS, "Load Memory Map"},
{"get_hardware_info", get_hardware_info, METH_VARARGS, "Get CPU Telemetry"},
{NULL, NULL, 0, NULL}
};
static struct PyModuleDef module = {
PyModuleDef_HEAD_INIT, "crayon_cpu", "Crayon AVX2 Backend", -1, Methods
};
PyMODINIT_FUNC PyInit_crayon_cpu(void) {
return PyModule_Create(&module);
}