programmersd
/

movie_nerd

Model card Files Files and versions

movie_nerd / extract_tokenizer.py

Soumalya Das

Upload folder using huggingface_hub

b0986f4 verified 6 months ago

History Blame Contribute Delete

904 Bytes

	import pickle
	import json
	import sys
	import string

	class SimpleTokenizer:
	    """Minimal tokenizer container holding only a ``vocab`` attribute.

	    NOTE(review): presumably declared here so that ``pickle.load`` can
	    resolve the class name stored in ``tokenizer.pkl`` -- confirm against
	    the code that produced the pickle.
	    """

	    def __init__(self, vocab=None):
	        """Store ``vocab``; use a fresh empty dict when none is given."""
	        # Test against None instead of truthiness so that a caller-supplied
	        # empty mapping is kept rather than replaced by a different object.
	        self.vocab = {} if vocab is None else vocab

	def is_clean_token(t: object) -> bool:
	    """Return True when ``t`` is a string acceptable as an exported vocab key.

	    A token is accepted only if it is a ``str``, ``str.isprintable()``
	    holds for it, and it contains neither NUL (U+0000) nor the Unicode
	    replacement character (U+FFFD). Any non-string value yields False.
	    The empty string is accepted, since ``"".isprintable()`` is True.
	    """
	    if not isinstance(t, str):
	        return False
	    # U+FFFD is reported as printable, so it must be rejected explicitly.
	    return t.isprintable() and "\u0000" not in t and "\ufffd" not in t

	# Script body: load the pickled tokenizer, keep only entries whose key
	# passes is_clean_token, and write them to tokenizer_vocab.json.
	try:
	    # NOTE(review): pickle.load can execute arbitrary code embedded in the
	    # file; only run this against a tokenizer.pkl from a trusted source.
	    with open("tokenizer.pkl", "rb") as f:
	        tokenizer_obj = pickle.load(f)

	    # Accept either an object exposing ``.vocab`` or the bare mapping itself.
	    vocab = tokenizer_obj.vocab if hasattr(tokenizer_obj, "vocab") else tokenizer_obj

	    # Drop every entry whose key is not a clean, printable string.
	    clean_vocab = {
	        k: v for k, v in vocab.items()
	        if is_clean_token(k)
	    }

	    # ensure_ascii=True escapes non-ASCII characters as \uXXXX in the output.
	    with open("tokenizer_vocab.json", "w", encoding="utf-8") as f:
	        json.dump(clean_vocab, f, indent=2, ensure_ascii=True)

	    print("✓ Clean vocab extracted")
	    print(f"✓ Original size: {len(vocab)}")
	    print(f"✓ Clean size: {len(clean_vocab)}")

	except Exception as e:
	    # Top-level boundary of the script: report any failure (missing file,
	    # unpickling error, non-mapping vocab, unserializable value) and exit
	    # with a non-zero status.
	    print(f"✗ Error: {e}")
	    sys.exit(1)