blaxx14 committed on
Commit
80c9d39
·
1 Parent(s): 0099550

create api

Browse files
Files changed (2) hide show
  1. app.py +153 -0
  2. requirements.txt +96 -0
app.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ import tensorflow as tf
3
+ from transformers import AutoTokenizer, TFT5ForConditionalGeneration
4
+ from transformers import MBartForConditionalGeneration, MBart50Tokenizer
5
+ import os
6
+ import re
7
+ import spacy
8
+ from nltk.corpus import wordnet as wn
9
+ import random
10
+ import nltk
11
+ nltk.download('wordnet')  # Runs at import time; fetches the WordNet data that get_synonyms() reads through nltk.corpus.wordnet (wn).
12
+
13
+ nlp = spacy.load("en_core_web_sm")  # spaCy pipeline; generate_distractors() uses its part-of-speech tags to pick keywords.
14
+
15
+ app = Flask(__name__)  # Flask application object on which the /generate-question route is registered.
16
+
17
+ # Model identifier handed to TFT5ForConditionalGeneration.from_pretrained() when the model is loaded.
18
+ LOCAL_QG_MODEL_PATH = "blaxx14/t5-question-generation"  # NOTE(review): named LOCAL_* but looks like a hub repo id rather than a local path - confirm.
19
+
20
+ """string into dictionary"""
21
def parse_to_dict(input_string):
    """Parse a ``"Question: ... Answer: ..."`` string into a dictionary.

    Args:
        input_string: Text expected to contain the marker ``'Answer: '``.
            Everything before the first marker is treated as the question,
            everything after it as the answer.

    Returns:
        A dict with the keys ``"Question"`` and ``"Answer"`` (both stripped
        of surrounding whitespace), or ``None`` when the marker is missing.
        In the ``None`` case a diagnostic message is printed.
    """
    try:
        # maxsplit=1: an answer that itself contains 'Answer: ' must stay in
        # one piece instead of breaking the two-name unpacking below.
        question_part, answer_part = input_string.split('Answer: ', 1)
    except ValueError:
        # Raised by the unpacking when the marker does not occur at all.
        print("Format input string tidak sesuai")
        return None

    return {
        "Question": question_part.replace('Question: ', '').strip(),
        "Answer": answer_part.strip(),
    }
37
+
38
+
39
+ """Find sinonim"""
40
def get_synonyms(word):
    """Collect the distinct WordNet lemma names found for *word*.

    Every synset returned by ``wn.synsets(word)`` is visited and the name of
    each of its lemmas is gathered; a set is used so repeated names collapse.

    Returns:
        A list of the unique lemma names (in set-iteration order).
    """
    lemma_names = {
        lemma.name()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }
    return list(lemma_names)
46
+
47
+
48
+ """Create distractor"""
49
def generate_distractors(question, correct_answer):
    """Pick up to three distinct distractor words for a generated question.

    Tokens that the spaCy pipeline ``nlp`` tags as ``NOUN`` or ``PROPN`` in
    *question* are looked up with ``get_synonyms``. Candidates equal to
    *correct_answer* (case-insensitive) are discarded.

    Args:
        question: Question text to mine for keywords.
        correct_answer: The correct answer; never returned as a distractor.

    Returns:
        A randomly chosen list of at most three candidates, with no repeats.
    """
    doc = nlp(question)
    keywords = [token.text for token in doc if token.pos_ in ('NOUN', 'PROPN')]

    # random.sample() draws by position, so a synonym collected for two
    # different keywords could otherwise be returned twice. Seeding `seen`
    # with the answer also filters it out in the same pass.
    seen = {correct_answer.lower()}
    candidates = []
    for keyword in keywords:
        for synonym in get_synonyms(keyword):
            key = synonym.lower()
            if key not in seen:
                seen.add(key)
                candidates.append(synonym)

    return random.sample(candidates, min(3, len(candidates)))
64
+
65
+ """Load question generator model and tokenizer"""
66
+ print("Loading model...")  # Progress message; the loads below run once, when the module is imported.
67
+ model = TFT5ForConditionalGeneration.from_pretrained(LOCAL_QG_MODEL_PATH, from_pt=False)  # from_pt=False: do not convert from a PyTorch checkpoint.
68
+ tokenizer = AutoTokenizer.from_pretrained("t5-small")  # NOTE(review): tokenizer comes from "t5-small", not LOCAL_QG_MODEL_PATH - confirm the model uses the same vocabulary.
69
+ print("Model loaded successfully.")  # Printed only after both loads above have returned.
70
+
71
+ """Function for generate question"""
72
def generate_question(text, max_length=4096):
    """Generate question/answer text for *text* with the loaded model.

    The input is prefixed with the task prompt, encoded by ``tokenizer``
    (truncated to ``max_length=512``) and handed to ``model.generate``.

    Args:
        text: Source passage to generate a question from.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    prompt = f"Generate question answer: {text}"
    encoded = tokenizer.encode(
        prompt, return_tensors="tf", max_length=512, truncation=True
    )

    generation_options = {
        "max_length": max_length,
        "num_beams": 10,
        "top_k": 0,
        "top_p": 0.8,
        "temperature": 1.5,
        "do_sample": True,
        "early_stopping": True,
    }
    sequences = model.generate(encoded, **generation_options)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)
89
+
90
+ """Cleaning input"""
91
def clean_text(text):
    """Remove the literal ``translit.`` marker and ``[...]`` spans from *text*.

    The marker is removed first, then every shortest ``[...]`` span is
    deleted. ``.`` does not match a newline here, so a bracket pair that
    spans several lines is left untouched. Surrounding whitespace is not
    normalised.
    """
    return re.sub(r'\[.*?\]', '', text.replace("translit.", ""))
95
+
96
def split_text_into_sentences(paragraph):
    """Clean *paragraph* and split it into sentences.

    The text is passed through ``clean_text`` and then split at whitespace
    that follows ``.``, ``?`` or ``!``.

    Args:
        paragraph: Raw input text.

    Returns:
        A list of non-empty sentence strings; an empty list when nothing
        remains after cleaning.
    """
    # Strip first: whitespace after the final punctuation mark (e.g. a
    # trailing newline) would otherwise produce an empty last "sentence".
    text = clean_text(paragraph).strip()
    # re.split() returns [''] for an empty string, so drop empty pieces.
    return [sentence for sentence in re.split(r'(?<=[.?!])\s+', text) if sentence]
100
+
101
def split_into_parts(sentences, num_parts=5):
    """Group *sentences* into at most *num_parts* consecutive parts.

    Args:
        sentences: Sequence of sentence strings, in document order.
        num_parts: Maximum number of parts to produce (must be >= 1).

    Returns:
        A list of lists of sentences. With ``num_parts`` or fewer sentences,
        each sentence becomes its own single-element part. Otherwise exactly
        ``num_parts`` parts are returned whose sizes differ by at most one,
        with the larger parts first. Order is preserved and *sentences* is
        not modified.

    Raises:
        ValueError: If *num_parts* is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError("num_parts must be at least 1")

    total = len(sentences)
    if total <= num_parts:
        # Always hand back a list of lists so callers can ' '.join() every
        # part; returning bare strings here made the join operate on
        # individual characters.
        return [[sentence] for sentence in sentences]

    # Spread the remainder over the leading parts. Fixed-size chunking could
    # leave more than num_parts chunks (e.g. 9 sentences -> 8 parts).
    base_size, remainder = divmod(total, num_parts)
    parts = []
    start = 0
    for index in range(num_parts):
        end = start + base_size + (1 if index < remainder else 0)
        parts.append(list(sentences[start:end]))
        start = end
    return parts
113
+
114
+ """Route for run generator and save the results in cloud"""
115
@app.route('/generate-question', methods=['POST'])
def api_generate_question():
    """Generate questions, answers and distractors for the posted text.

    Expects a JSON body with a non-empty string under ``"text"``. The text
    is split into sentences, grouped into parts, and one question is
    generated per part.

    Returns:
        ``{"generated_question": [...]}`` on success, where each item has
        the keys ``"Question"``, ``"Answer"`` and ``"distractor"``; a 400
        response when the text is missing or empty; a 500 response carrying
        the exception message for any other failure.
    """
    try:
        # silent=True returns None instead of raising on a missing or
        # invalid JSON body, so malformed requests get the 400 below
        # rather than falling through to the generic 500 handler.
        data = request.get_json(silent=True)
        text = data.get('text', '') if isinstance(data, dict) else ''

        if not isinstance(text, str) or not text.strip():
            return jsonify({'error': 'Text tidak boleh kosong'}), 400

        sentences = split_text_into_sentences(text)
        parts = split_into_parts(sentences)

        question_list = []
        for part in parts:
            # A part may be one sentence string or a list of sentences.
            # Joining a bare string would put a space between every
            # character, so only join real lists.
            combined_input = part if isinstance(part, str) else ' '.join(part)
            if not combined_input.strip():
                continue

            result_dict = parse_to_dict(generate_question(combined_input))
            if result_dict is None:
                # The model output did not match the expected format; skip
                # this part instead of failing the whole request.
                continue

            result_dict["distractor"] = generate_distractors(
                result_dict["Question"], result_dict["Answer"]
            )
            question_list.append(result_dict)

        print(question_list)
        return jsonify({'generated_question': question_list})
    except Exception as e:
        # Top-level boundary: report any unexpected failure to the client.
        return jsonify({'error': str(e)}), 500
150
+
151
+
152
+ if __name__ == '__main__':  # Only start the server when this file is executed directly, not when it is imported.
153
+ app.run(host='0.0.0.0', port=8080)  # Flask's built-in server, listening on all interfaces on port 8080.
requirements.txt ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ annotated-types==0.7.0
3
+ astunparse==1.6.3
4
+ blinker==1.9.0
5
+ blis==1.0.2
6
+ cachetools==5.5.0
7
+ catalogue==2.0.10
8
+ certifi==2024.8.30
9
+ charset-normalizer==3.4.0
10
+ click==8.1.7
11
+ cloudpathlib==0.20.0
12
+ colorama==0.4.6
13
+ confection==0.1.5
14
+ cymem==2.0.10
15
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
16
+ filelock==3.16.1
17
+ Flask==3.1.0
18
+ flatbuffers==24.3.25
19
+ fsspec==2024.10.0
20
+ gast==0.6.0
21
+ # google-api-core==2.24.0
22
+ # google-auth==2.36.0
23
+ # google-cloud-core==2.4.1
24
+ # google-cloud-firestore==2.19.0
25
+ # google-pasta==0.2.0
26
+ # googleapis-common-protos==1.66.0
27
+ grpcio==1.68.1
28
+ grpcio-status==1.68.1
29
+ h5py==3.12.1
30
+ huggingface-hub==0.26.5
31
+ idna==3.10
32
+ itsdangerous==2.2.0
33
+ Jinja2==3.1.4
34
+ joblib==1.4.2
35
+ keras==3.7.0
36
+ langcodes==3.5.0
37
+ language_data==1.3.0
38
+ libclang==18.1.1
39
+ marisa-trie==1.2.1
40
+ Markdown==3.7
41
+ markdown-it-py==3.0.0
42
+ MarkupSafe==3.0.2
43
+ mdurl==0.1.2
44
+ ml-dtypes==0.4.1
45
+ mpmath==1.3.0
46
+ murmurhash==1.0.11
47
+ namex==0.0.8
48
+ networkx==3.4.2
49
+ nltk==3.9.1
50
+ numpy==2.0.2
51
+ opt_einsum==3.4.0
52
+ optree==0.13.1
53
+ packaging==24.2
54
+ preshed==3.0.9
55
+ proto-plus==1.25.0
56
+ protobuf==5.29.1
57
+ pyasn1==0.6.1
58
+ pyasn1_modules==0.4.1
59
+ pydantic==2.10.3
60
+ pydantic_core==2.27.1
61
+ Pygments==2.18.0
62
+ PyYAML==6.0.2
63
+ regex==2024.11.6
64
+ requests==2.32.3
65
+ rich==13.9.4
66
+ rsa==4.9
67
+ safetensors==0.4.5
68
+ sentencepiece==0.2.0
69
+ shellingham==1.5.4
70
+ six==1.17.0
71
+ smart-open==7.0.5
72
+ spacy==3.8.2
73
+ spacy-legacy==3.0.12
74
+ spacy-loggers==1.0.5
75
+ srsly==2.5.0
76
+ sympy==1.13.1
77
+ tensorboard==2.18.0
78
+ tensorboard-data-server==0.7.2
79
+ tensorflow==2.18.0
80
+ tensorflow-io-gcs-filesystem==0.31.0
81
+ tensorflow_intel==2.18.0
82
+ termcolor==2.5.0
83
+ tf_keras==2.18.0
84
+ thinc==8.3.2
85
+ tiktoken==0.8.0
86
+ tokenizers==0.21.0
87
+ torch==2.5.1
88
+ tqdm==4.67.1
89
+ transformers==4.47.0
90
+ typer==0.15.1
91
+ typing_extensions==4.12.2
92
+ urllib3==2.2.3
93
+ wasabi==1.1.3
94
+ weasel==0.4.1
95
+ Werkzeug==3.1.3
96
+ wrapt==1.17.0