Zaruhi committed
Commit 4be827f · 0 Parent(s)

Initial release

.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,204 @@
+ ---
+ library_name: transformers
+ language:
+ - en
+ - hy
+ base_model:
+ - intfloat/multilingual-e5-base
+ tags:
+ - sentence-transformers
+ - ArmBench-TextEmbed
+ license: mit
+ datasets:
+ - Metric-AI/reddit_10K_hye
+ ---
+
+ # Armenian-Text-Embeddings-2 (ATE-2)
+
+ ## Model Details
+ - **Model Name**: Armenian-Text-Embeddings-2-base
+ - **Model Type**: Text Embeddings for Armenian Language
+ - **Base Model**: intfloat/multilingual-e5-base
+ - **Version**: 2.0
+ - **Last Updated**: March 2026
+ - **Model Architecture**: Transformer-based embeddings model
+ - **Input**: Armenian text
+ - **Output**: Dense vector embeddings
+
+ ## Quick Start
+ ```python
+ import torch.nn.functional as F
+
+ from torch import Tensor
+ from transformers import AutoTokenizer, AutoModel
+
+ tokenizer = AutoTokenizer.from_pretrained('Metric-AI/armenian-text-embeddings-2-base')
+ model = AutoModel.from_pretrained('Metric-AI/armenian-text-embeddings-2-base')
+
+
+ def average_pool(last_hidden_states: Tensor,
+                  attention_mask: Tensor) -> Tensor:
+     # Zero out padding positions, then mean-pool over the sequence dimension
+     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+
+
+ # Each input text should start with "query: " or "passage: ", even for non-English texts.
+ # For tasks other than retrieval, you can simply use the "query: " prefix.
+ input_texts = [
+     'query: Ինչպե՞ս պատրաստել տոլմա',  # How to make tolma
+     'query: Քանի՞ գրամ սպիտակուց է հարկավոր օրական',  # How many grams of protein are needed daily
+
+     """passage: Տոլմայի բաղադրատոմս՝
+ Բաղադրիչներ՝
+ - 500գ աղացած միս
+ - 1 բաժակ բրինձ
+ - Խաղողի տերևներ
+ - 2 գլուխ սոխ
+ - Համեմունքներ՝ աղ, սև պղպեղ, քարի
+
+ Պատրաստման եղանակը՝
+ 1. Միսը խառնել բրնձի, մանր կտրատած սոխի և համեմունքների հետ
+ 2. Խաղողի տերևները լվանալ և թողնել տաք ջրի մեջ 10 րոպե
+ 3. Լցոնել տերևները և դասավորել կաթսայի մեջ
+ 4. Եփել դանդաղ կրակի վրա 45-60 րոպե""",  # Detailed tolma recipe
+
+     """passage: Սպիտակուցի օրական չափաբաժինը կախված է մարդու քաշից, սեռից և ֆիզիկական ակտիվությունից:
+ Միջին հաշվով, կանանց համար խորհուրդ է տրվում 46-50 գրամ սպիտակուց օրական:
+ Մարզիկների համար այս թիվը կարող է հասնել մինչև 1.6-2 գրամ մարմնի քաշի յուրաքանչյուր կիլոգրամի համար:
+ Հղիների համար պահանջվում է լրացուցիչ 25 գրամ սպիտակուց:
+
+ Սպիտակուցի հարուստ աղբյուրներ են՝
+ - Հավի միս (31գ/100գ)
+ - Ձու (13գ/100գ)
+ - Ոսպ (25գ/100գ)
+ - Մածուն (3.5գ/100գ)"""]  # Detailed protein intake advice
+
+ # Tokenize the input texts
+ batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
+ outputs = model(**batch_dict)
+ embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+
+ # Normalize embeddings so that dot products are cosine similarities
+ embeddings = F.normalize(embeddings, p=2, dim=1)
+ scores = (embeddings[:2] @ embeddings[2:].T) * 100
+ print(scores.tolist())
+
+ # [[80.32380676269531, 42.62339401245117], [44.12641525268555, 78.37036895751953]]
+ ```
+
+ ## Support for Sentence Transformers
+
+ Below is a usage example with sentence_transformers:
+ ```python
+ from sentence_transformers import SentenceTransformer
+ model = SentenceTransformer('Metric-AI/armenian-text-embeddings-2-base')
+
+ # Reuse the prefixed input_texts list from the Quick Start above;
+ # the "query: " / "passage: " prefixes are still required.
+ embeddings = model.encode(input_texts, normalize_embeddings=True)
+ ```
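+
+ Because the encoded vectors are L2-normalized, the same similarity scores as in the Quick Start can be recovered with a plain dot product. A minimal follow-up sketch, assuming the same two-query, two-passage ordering of `input_texts`:
+ ```python
+ # embeddings: NumPy array of shape (4, 768); rows 0-1 are queries, rows 2-3 are passages
+ scores = (embeddings[:2] @ embeddings[2:].T) * 100
+ print(scores.tolist())
+ ```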
+
+
+ ## Intended Use
+ ### Primary Intended Uses
+ - Retrieval-augmented generation (RAG)
+ - Semantic search in Armenian
+ - Document similarity computation
+ - Cross-lingual text understanding
+ - Text classification tasks
+ - Information retrieval
+
+
+ ## Training Procedure
+ This model was trained following the recipe described in *Less is More: Adapting Text Embeddings
+ for Low-Resource Languages with Small Scale Noisy Synthetic Data*.
+
+ ### Training Details
+ - **Weight Averaging** (see the sketch after this list):
+   - Base model (multilingual-e5-base): 0.5 weight
+   - Fine-tuned checkpoint 1 (Armenian only): 0.25 weight
+   - Fine-tuned checkpoint 2 (Armenian + transliteration): 0.25 weight
+ - **Hardware**: 8 x MI250x GPUs
+ - **Training Parameters**:
+   - Epochs: 5
+   - Batch Size: 1024 per GPU
+   - Learning Rate: 7e-5
+   - Weight Decay: 0.01
+   - Warmup Ratio: 0.2
+   - Maximum Sequence Length: 128 tokens
+   - FP16 Training: Enabled
+   - Gradient Clipping: 1.0
+
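+ The weight averaging above amounts to a weighted average (a "model soup") over three checkpoints. A minimal sketch with `transformers`; the two `ckpt-*` paths are hypothetical placeholders, since the intermediate fine-tuned checkpoints were not released:
+ ```python
+ from transformers import AutoModel
+
+ # (checkpoint, weight) pairs; the local ckpt-* paths are illustrative assumptions
+ sources = [
+     ('intfloat/multilingual-e5-base', 0.50),   # base model
+     ('./ckpt-armenian-only', 0.25),            # fine-tuned on Armenian data
+     ('./ckpt-armenian-translit', 0.25),        # fine-tuned on Armenian + transliterations
+ ]
+
+ state_dicts = [AutoModel.from_pretrained(path).state_dict() for path, _ in sources]
+
+ # Weighted average of every floating-point parameter; non-float buffers are copied as-is
+ averaged = {}
+ for name, tensor in state_dicts[0].items():
+     if tensor.is_floating_point():
+         averaged[name] = sum(w * sd[name] for sd, (_, w) in zip(state_dicts, sources))
+     else:
+         averaged[name] = tensor
+
+ model = AutoModel.from_pretrained('intfloat/multilingual-e5-base')
+ model.load_state_dict(averaged)
+ ```
+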
+ ### Optimization Configuration
+ - **Framework**: DeepSpeed Stage 2 (see the config sketch below)
+ - **Optimizer**: AdamW with auto weight decay
+ - **Mixed Precision**: FP16 with dynamic loss scaling
+ - **ZeRO Optimization**: Stage 2 with:
+   - Allgather partitions
+   - Overlap communications
+   - Contiguous gradients
+ - **Additional Features**:
+   - Gradient checkpointing
+   - Tensor parallelism (size: 2)
+
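+ A DeepSpeed configuration matching the settings above might look as follows. This is a hedged reconstruction for illustration, not the released training config; the `"auto"` value assumes the Hugging Face Trainer integration:
+ ```python
+ # Reconstructed DeepSpeed ZeRO Stage 2 config (assumption, mirrors the list above)
+ ds_config = {
+     "train_micro_batch_size_per_gpu": 1024,
+     "optimizer": {
+         "type": "AdamW",
+         "params": {"lr": 7e-5, "weight_decay": "auto"},
+     },
+     "fp16": {"enabled": True},  # DeepSpeed uses dynamic loss scaling by default
+     "gradient_clipping": 1.0,
+     "zero_optimization": {
+         "stage": 2,
+         "allgather_partitions": True,
+         "overlap_comm": True,
+         "contiguous_gradients": True,
+     },
+ }
+ ```
+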
+ ## Performance and Limitations
+ ### Capabilities
+ - Effective for semantic similarity tasks in Armenian
+ - Suitable for document classification and clustering
+ - Handles transliterated Armenian queries
+
+ ### Limitations
+ - Performance may vary on domain-specific terminology
+ - May not capture Armenian-specific cultural contexts effectively
+ - Limited by the quality of training data translations
+
+ ### Known Biases
+ - May exhibit biases present in Reddit content
+
+
+ ## Ethical Considerations
+ - **Data Privacy**: Training data comes from public Reddit content
+ - **Potential Misuse**: Could be misused for content manipulation or spam
+ - **Bias**: May perpetuate social biases present in Reddit content
+ - **Recommendations**:
+   - Monitor system outputs for harmful content
+   - Implement content filtering for production use
+   - Assess for bias regularly
+
+ ## Technical Specifications
+ - **Model Size**: 0.3B parameters (based on e5-base)
+ - **Embedding Dimension**: 768
+ - **Max Sequence Length**: 512 tokens
+ - **Framework Compatibility**:
+   - PyTorch
+   - Hugging Face Transformers
+   - DeepSpeed
+
+ ## Citation
+ ```bibtex
+ @misc{armenian-text-embeddings-2-base,
+   author = {Navasardyan, Zaruhi and Bughdaryan, Spartak and Minasyan, Bagratuni and Davtyan, Hrant},
+   title  = {Armenian-Text-Embeddings-2-base: Enhanced Armenian Language Embeddings},
+   year   = {2026}
+ }
+
+ @inproceedings{navasardyan2026lessismore,
+   title     = {Less is More: Adapting Text Embeddings for Low-Resource Languages with Small Scale Noisy Synthetic Data},
+   author    = {Navasardyan, Zaruhi and Bughdaryan, Spartak and Minasyan, Bagratuni and Davtyan, Hrant},
+   booktitle = {Proceedings of the Workshop on Language Models for Low-Resource Languages (LoResLM) at EACL 2026},
+   year      = {2026}
+ }
+ ```
+
+ ## Additional Information
+ ### Base Model References
+ - multilingual-e5-base: [https://huggingface.co/intfloat/multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base)
+
+ ### Acknowledgments
+ - intfloat for the original multilingual-e5-base model
+ - The Reddit community for the source content
+ - The DeepSpeed team for the optimization toolkit
+ - EuroHPC Joint Undertaking for granting access to the LUMI supercomputer, hosted by CSC (Finland) and the LUMI consortium
+
+ ## Version History
+ - 1.0 (November 2024): Initial release
+ - 2.0 (March 2026): This release
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "architectures": [
+     "XLMRobertaModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "dtype": "float32",
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "xlm-roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.57.3",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 250002
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a496ec51934b0ad32db4cbfb4afe0351b9e338cefaf4ff613bf0f0548a002b6
+ size 1112197096
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
+ size 17082734
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "250001": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "XLMRobertaTokenizerFast",
+   "unk_token": "<unk>"
+ }