Latvian named entity recognition (NER)

Dataset

Results

Results on the test split:

| Label | Precision | Recall | F1 Score |
|---|---|---|---|
| Micro Avg | 87.2 | 87.9 | 87.6 |
| Macro Avg | 76.6 | 73.1 | 73.8 |
| GPE | 93.2 | 93.2 | 93.2 |
| entity | 50.0 | 55.2 | 52.5 |
| event | 72.0 | 81.8 | 76.6 |
| location | 81.5 | 78.6 | 80.0 |
| money | 60.0 | 25.0 | 35.3 |
| organization | 87.2 | 89.2 | 88.2 |
| person | 96.5 | 98.4 | 97.4 |
| product | 75.0 | 58.1 | 65.5 |
| time | 73.8 | 78.3 | 75.9 |

Usage

import re

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer


class NER:
    """Token-classification wrapper that turns raw text into entity spans.

    The text is split into words and punctuation marks with a regular
    expression, the resulting word list is run through the tokenizer and
    model, and the label predicted for the first sub-token of every word is
    used as that word's label. Consecutive words carrying a continuation of
    the same entity type are merged into one span.
    """

    def __init__(self, model_name='AiLab-IMCS-UL/lv-ner-v1', max_length=1024):
        """Load the tokenizer and the model and switch the model to eval mode.

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                tokenizer and the token-classification model.
            max_length: Token limit handed to the tokenizer; inputs that
                exceed it are truncated.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        network = AutoModelForTokenClassification.from_pretrained(model_name)
        self.model = network.eval()
        self.id2label = self.model.config.id2label
        self.max_length = max_length

    def predict(self, text):
        """Return the entities found in ``text``.

        Args:
            text: The string to analyse.

        Returns:
            A list of dicts in order of appearance, each with the keys
            ``start`` and ``end`` (character offsets into ``text``),
            ``label`` (the label with its prefix removed) and ``text`` (the
            covered substring). The list is empty when ``text`` contains no
            non-whitespace characters.
        """
        matches = list(re.finditer(r'\w+|\S', text))
        if len(matches) == 0:
            return []

        words = [match.group(0) for match in matches]
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
        )
        token_word_ids = encoding.word_ids(0)

        with torch.no_grad():
            logits = self.model(**encoding).logits
            label_ids = logits.argmax(-1)[0].tolist()

        # Character span of every pre-tokenized word, indexed by word id.
        spans = [match.span() for match in matches]
        entities = []
        current = None
        last_word_id = None

        for label_id, word_id in zip(label_ids, token_word_ids):
            # Only the first sub-token of a word decides the word's label;
            # special tokens (word id None) and later sub-tokens are skipped.
            starts_new_word = word_id is not None and word_id != last_word_id
            last_word_id = word_id
            if not starts_new_word:
                continue

            word_start, word_end = spans[word_id]
            tag = self.id2label[label_id]

            if tag == 'O':
                # An outside word closes whatever entity was being built.
                if current is not None:
                    entities.append(current)
                    current = None
                continue

            # A label without a hyphen is handled as if it began an entity.
            marker, hyphen, entity_type = tag.partition('-')
            if not hyphen:
                marker, entity_type = 'B', tag

            extends_current = (
                current is not None
                and marker != 'B'
                and current['label'] == entity_type
            )
            if extends_current:
                current['end'] = word_end
                continue

            if current is not None:
                entities.append(current)
            current = {'start': word_start, 'end': word_end, 'label': entity_type}

        if current is not None:
            entities.append(current)

        for entity in entities:
            entity['text'] = text[entity['start']:entity['end']]
        return entities

# Example run: build the recognizer with its default model name and
# max_length, which loads the tokenizer and model through from_pretrained.
m = NER()
# Print the list of entity dicts (start, end, label, text) for a sample sentence.
print(m.predict('Jānis Bērziņš strādā Latvijas uzņēmumā SIA Mia.'))

Downloads last month: 51

Safetensors

Model size

0.1B params

Tensor type

F32

Model tree for AiLab-IMCS-UL/lv-ner-v1

Base model

AiLab-IMCS-UL/lv-deberta-base

Finetuned

(1)

this model