Latvian named entity recognition (NER)

Dataset

Trained on the FullStack dataset.

Results

Results on the test split:

| Label | Precision | Recall | F1 Score |
|---|---|---|---|
| Micro Avg | 87.2 | 87.9 | 87.6 |
| Macro Avg | 76.6 | 73.1 | 73.8 |
| GPE | 93.2 | 93.2 | 93.2 |
| entity | 50.0 | 55.2 | 52.5 |
| event | 72.0 | 81.8 | 76.6 |
| location | 81.5 | 78.6 | 80.0 |
| money | 60.0 | 25.0 | 35.3 |
| organization | 87.2 | 89.2 | 88.2 |
| person | 96.5 | 98.4 | 97.4 |
| product | 75.0 | 58.1 | 65.5 |
| time | 73.8 | 78.3 | 75.9 |

Usage

import re

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer


class NER:
    """Named entity tagger wrapping a token-classification checkpoint.

    The input text is split into words and single non-space characters with a
    regular expression, the pieces are tagged by the model, and consecutive
    tagged words are merged into character-offset entity spans.
    """

    def __init__(self, model_name='AiLab-IMCS-UL/lv-ner-v1', max_length=1024):
        """Load the tokenizer and model identified by *model_name*.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the token-classification model.
            max_length: Value forwarded to the tokenizer as ``max_length``
                (with truncation enabled) on every ``predict`` call.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)

        self.tokenizer = tokenizer
        # Switch to evaluation mode once, at load time.
        self.model = model.eval()
        self.id2label = self.model.config.id2label
        self.max_length = max_length

    def predict(self, text):
        """Return the entities found in *text*.

        Args:
            text: The string to tag.

        Returns:
            A list of dicts in order of appearance, each with the keys
            ``'start'`` and ``'end'`` (character offsets into *text*),
            ``'label'`` (the tag with any ``X-`` prefix removed) and
            ``'text'`` (``text[start:end]``). The list is empty when *text*
            contains no word or non-space character.
        """
        # Pre-tokenize into word runs and single non-space characters,
        # remembering where each piece sits in the original string.
        words, spans = [], []
        for match in re.finditer(r'\w+|\S', text):
            words.append(match.group(0))
            spans.append(match.span())
        if not words:
            return []

        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
        )
        token_word_ids = encoding.word_ids(0)

        with torch.no_grad():
            logits = self.model(**encoding).logits
            label_ids = logits.argmax(-1)[0].tolist()

        entities = []
        current = None
        last_word = None
        for label_id, word_idx in zip(label_ids, token_word_ids):
            # Only the first token of each word decides the word's tag;
            # tokens without a word index and repeated indices are skipped.
            skip = word_idx is None or word_idx == last_word
            last_word = word_idx
            if skip:
                continue

            span_start, span_end = spans[word_idx]
            tag = self.id2label[label_id]

            if tag == 'O':
                # An outside tag closes whatever entity is open.
                if current is not None:
                    entities.append(current)
                    current = None
                continue

            # A tag without a dash is treated as if it carried a 'B' prefix.
            prefix, dash, remainder = tag.partition('-')
            if dash:
                label = remainder
            else:
                prefix, label = 'B', tag

            extends = (
                prefix != 'B'
                and current is not None
                and current['label'] == label
            )
            if extends:
                current['end'] = span_end
                continue

            if current is not None:
                entities.append(current)
            current = {'start': span_start, 'end': span_end, 'label': label}

        if current is not None:
            entities.append(current)

        # Attach the surface string last, once the final offsets are known.
        for entity in entities:
            entity['text'] = text[entity['start']:entity['end']]

        return entities

# Demo: load the default checkpoint and tag one Latvian sentence.
m = NER()
sample = 'Jānis Bērziņš strādā Latvijas uzņēmumā SIA Mia.'
print(m.predict(sample))
Downloads last month
51
Safetensors
Model size
0.1B params
Tensor type
F32
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for AiLab-IMCS-UL/lv-ner-v1

Finetuned
(1)
this model