aaron1141 committed
Commit e783436 (0 parents)

initial hf spaces demo
.gitignore ADDED
Binary file (126 Bytes).
 
LICENSE ADDED
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

   "License" shall mean the terms and conditions for use, reproduction,
   and distribution as defined by Sections 1 through 9 of this document.

   "Licensor" shall mean the copyright owner or entity authorized by
   the copyright owner that is granting the License.

   "Legal Entity" shall mean the union of the acting entity and all
   other entities that control, are controlled by, or are under common
   control with that entity. For the purposes of this definition,
   "control" means (i) the power, direct or indirect, to cause the
   direction or management of such entity, whether by contract or
   otherwise, or (ii) ownership of fifty percent (50%) or more of the
   outstanding shares, or (iii) beneficial ownership of such entity.

   "You" (or "Your") shall mean an individual or Legal Entity
   exercising permissions granted by this License.

   "Source" form shall mean the preferred form for making modifications,
   including but not limited to software source code, documentation
   source, and configuration files.

   "Object" form shall mean any form resulting from mechanical
   transformation or translation of a Source form, including but
   not limited to compiled object code, generated documentation,
   and conversions to other media types.

   "Work" shall mean the work of authorship, whether in Source or
   Object form, made available under the License, as indicated by a
   copyright notice that is included in or attached to the work
   (an example is provided in the Appendix below).

   "Derivative Works" shall mean any work, whether in Source or Object
   form, that is based on (or derived from) the Work and for which the
   editorial revisions, annotations, elaborations, or other modifications
   represent, as a whole, an original work of authorship. For the purposes
   of this License, Derivative Works shall not include works that remain
   separable from, or merely link (or bind by name) to the interfaces of,
   the Work and Derivative Works thereof.

   "Contribution" shall mean any work of authorship, including
   the original version of the Work and any modifications or additions
   to that Work or Derivative Works thereof, that is intentionally
   submitted to Licensor for inclusion in the Work by the copyright owner
   or by an individual or Legal Entity authorized to submit on behalf of
   the copyright owner. For the purposes of this definition, "submitted"
   means any form of electronic, verbal, or written communication sent
   to the Licensor or its representatives, including but not limited to
   communication on electronic mailing lists, source code control systems,
   and issue tracking systems that are managed by, or on behalf of, the
   Licensor for the purpose of discussing and improving the Work, but
   excluding communication that is conspicuously marked or otherwise
   designated in writing by the copyright owner as "Not a Contribution."

   "Contributor" shall mean Licensor and any individual or Legal Entity
   on behalf of whom a Contribution has been received by Licensor and
   subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
   this License, each Contributor hereby grants to You a perpetual,
   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
   copyright license to reproduce, prepare Derivative Works of,
   publicly display, publicly perform, sublicense, and distribute the
   Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
   this License, each Contributor hereby grants to You a perpetual,
   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
   (except as stated in this section) patent license to make, have made,
   use, offer to sell, sell, import, and otherwise transfer the Work,
   where such license applies only to those patent claims licensable
   by such Contributor that are necessarily infringed by their
   Contribution(s) alone or by combination of their Contribution(s)
   with the Work to which such Contribution(s) was submitted. If You
   institute patent litigation against any entity (including a
   cross-claim or counterclaim in a lawsuit) alleging that the Work
   or a Contribution incorporated within the Work constitutes direct
   or contributory patent infringement, then any patent licenses
   granted to You under this License for that Work shall terminate
   as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
   Work or Derivative Works thereof in any medium, with or without
   modifications, and in Source or Object form, provided that You
   meet the following conditions:

   (a) You must give any other recipients of the Work or
       Derivative Works a copy of this License; and

   (b) You must cause any modified files to carry prominent notices
       stating that You changed the files; and

   (c) You must retain, in the Source form of any Derivative Works
       that You distribute, all copyright, patent, trademark, and
       attribution notices from the Source form of the Work,
       excluding those notices that do not pertain to any part of
       the Derivative Works; and

   (d) If the Work includes a "NOTICE" text file as part of its
       distribution, then any Derivative Works that You distribute
       must include a readable copy of the attribution notices
       contained within such NOTICE file, excluding those notices
       that do not pertain to any part of the Derivative Works, in
       at least one of the following places: within a NOTICE text
       file distributed as part of the Derivative Works; within
       the Source form or documentation, if provided along with the
       Derivative Works; or, within a display generated by the
       Derivative Works, if and wherever such third-party notices
       normally appear. The contents of the NOTICE file are for
       informational purposes only and do not modify the License.
       You may add Your own attribution notices within Derivative
       Works that You distribute, alongside or as an addendum to
       the NOTICE text from the Work, provided that such additional
       attribution notices cannot be construed as modifying the License.

   You may add Your own copyright statement to Your modifications and
   may provide additional or different license terms and conditions
   for use, reproduction, or distribution of Your modifications, or
   for any such Derivative Works as a whole, provided Your use,
   reproduction, and distribution of the Work otherwise complies with
   the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
   any Contribution intentionally submitted for inclusion in the Work
   by You to the Licensor shall be under the terms and conditions of
   this License, without any additional terms or conditions.
   Notwithstanding the above, nothing herein shall supersede or modify
   the terms of any separate license agreement you may have executed
   with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
   names, trademarks, service marks, or product names of the Licensor,
   except as required for reasonable and customary use in describing the
   origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
   agreed to in writing, Licensor provides the Work (and each
   Contributor provides its Contributions) on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
   implied, including, without limitation, any warranties or conditions
   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
   PARTICULAR PURPOSE. You are solely responsible for determining the
   appropriateness of using or redistributing the Work and assume any
   risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
   whether in tort (including negligence), contract, or otherwise,
   unless required by applicable law (such as deliberate and grossly
   negligent acts) or agreed to in writing, shall any Contributor be
   liable to You for damages, including any direct, indirect, special,
   incidental, or consequential damages of any character arising as a
   result of this License or out of the use or inability to use the
   Work (including but not limited to damages for loss of goodwill,
   work stoppage, computer failure or malfunction, or any and all
   other commercial damages or losses), even if such Contributor
   has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
   the Work or Derivative Works thereof, You may choose to offer,
   and charge a fee for, acceptance of support, warranty, indemnity,
   or other liability obligations and/or rights consistent with this
   License. However, in accepting such obligations, You may act only
   on Your own behalf and on Your sole responsibility, not on behalf
   of any other Contributor, and only if You agree to indemnify,
   defend, and hold each Contributor harmless for any liability
   incurred by, or claims asserted against, such Contributor by reason
   of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS
README.md ADDED
---
title: DataFlow-VQA Data Curation
emoji: 🔬
colorFrom: blue
colorTo: green
sdk: gradio
sdk_version: 4.44.0
app_file: app.py
pinned: false
license: apache-2.0
---

# DataFlow-VQA

**[中文文档](README_zh.md)**

A pipeline for extracting, curating, and generating chain-of-thought (CoT) data from PDF textbooks and exam papers.

[🤗Dataset](https://huggingface.co/datasets/OpenDCAI/FlipVQA)

## Overview

![DataFlow-VQA overview](static/overview_2.png)

DataFlow-VQA processes PDF documents through three sequential stages:

- Stage 1 (**Section 3.1: VQA Extraction**): Parses PDFs using [MinerU](https://github.com/opendatalab/MinerU) for document layout analysis, then uses an LLM to extract structured question-answer pairs with images.
- Stage 2 (**Section 3.2.1 to Section 3.2.5: Data Curation**): Filters and cleans the extracted QA pairs — splits sub-questions, classifies question types, extracts concise answers, and removes low-quality items.
- Stage 3 (**Section 3.2.6: CoT Generation**): Generates chain-of-thought reasoning via reject sampling — an LLM generates answers, which are verified against the ground truth, and incorrect ones are retried.

## Installation

This project is built on top of [DataFlow](https://github.com/OpenDCAI/DataFlow). Clone and install it first:

```shell
git clone https://github.com/OpenDCAI/DataFlow.git
cd DataFlow
pip install -e ".[pdf2vqa]"
```

Then clone this repository:

```shell
git clone <this-repo-url>
cd DataFlow-VQA
```

## Configuration

### API Keys

Two API keys are required:

- `DF_API_KEY`: API key for the LLM service (OpenAI, Google Gemini, DeepSeek, etc.)
- `MINERU_API_KEY`: API key for [MinerU](https://mineru.net/apiManage/token) document layout parsing

```shell
export DF_API_KEY="sk-xxxxx"
export MINERU_API_KEY="sk2-xxxxx"
```

### LLM Endpoint

Each pipeline accepts `--api_url` and `--model` arguments. Any [OpenAI-compatible API](https://platform.openai.com/docs/api-reference) endpoint is supported, including OpenAI, Google Gemini (via proxy), DeepSeek, and others.

Provide the **base URL** without `/chat/completions` (e.g. `https://api.openai.com/v1`).

---

## Stage 1: VQA Extraction

### Input Format

Create a JSONL file where each line describes one PDF extraction task:

```jsonl
{"input_pdf_paths": "./examples/VQA/questionextract_test.pdf", "name": "math1"}
{"input_pdf_paths": ["./examples/VQA/math_question.pdf", "./examples/VQA/math_answer.pdf"], "name": "math2"}
```

- `input_pdf_paths`: A single PDF (questions and answers interleaved) or a list of two or more PDFs (question PDFs before answer PDFs).
- `name`: A unique identifier for this task (used for directory naming and caching).
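If you have many PDFs, the input file can be generated programmatically. A minimal sketch, reusing the example tasks above (the paths and names are illustrative):

```python
import json

# Each entry becomes one line of the Stage 1 input JSONL.
tasks = [
    {"input_pdf_paths": "./examples/VQA/questionextract_test.pdf", "name": "math1"},
    {"input_pdf_paths": ["./examples/VQA/math_question.pdf",
                         "./examples/VQA/math_answer.pdf"], "name": "math2"},
]

with open("vqa_extract_test.jsonl", "w", encoding="utf-8") as f:
    for task in tasks:
        f.write(json.dumps(task, ensure_ascii=False) + "\n")
```

Keep `name` unique across tasks, since it also names the per-task cache directory.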
### Run

```bash
python -m pipelines.vqa_extract_optimized_pipeline \
    --input_file ./examples/VQA/vqa_extract_test.jsonl \
    --output_dir ./output \
    --api_url https://generativelanguage.googleapis.com/v1beta/openai/ \
    --model gemini-2.5-pro
```

**Important:** We recommend a strong model here. Weaker models such as `gpt-5-mini` may perform poorly.

### Output

- `{output_dir}/raw_vqa.jsonl`: Extracted QA pairs with image references
- `{output_dir}/{name}/vqa_images/`: Extracted images
- `cache/{name}/extracted_vqa.jsonl`, `merged_qa_pairs.jsonl`, `merged_qa_pairs.md`: Per-task intermediate files

Each QA item contains:

```json
{
  "question": "Compute $x$ such that $x^2 - 1 = 0$.",
  "answer": "$x = 1$ or $x = -1$",
  "solution": "Factor as $(x-1)(x+1)=0$.",
  "label": 1,
  "question_chapter_title": "Chapter 1: Quadratic Equations",
  "answer_chapter_title": "Chapter 1: Quadratic Equations",
  "image_basedir": "/path/to/your/images"
}
```
+ ### Note
117
+
118
+ **We also support using a local MinerU deployment**: Replace `FileOrURLToMarkdownConverterAPI` with `FileOrURLToMarkdownConverterLocal` or `FileOrURLToMarkdownConverterFlash` in `pipelines/vqa_extract_optimized_pipeline.py`:
119
+
120
+ ```python
121
+ # Original opendatalab local version
122
+ self.mineru_executor = FileOrURLToMarkdownConverterLocal(
123
+ intermediate_dir="intermediate",
124
+ mineru_model_path="path/to/mineru/model",
125
+ )
126
+
127
+ # Accelerated version (Flash)
128
+ self.mineru_executor = FileOrURLToMarkdownConverterFlash(
129
+ intermediate_dir="intermediate",
130
+ mineru_model_path="path/to/mineru/model",
131
+ batch_size=4,
132
+ replicas=1,
133
+ num_gpus_per_replica=1,
134
+ engine_gpu_util_rate_to_ray_cap=0.9,
135
+ )
136
+ ```
137
+
138
+ See [DataFlow's MinerU operators](https://github.com/OpenDCAI/DataFlow/blob/main/dataflow/operators/knowledge_cleaning/generate/mineru_operators.py) for full parameter documentation.
139
+
140
+ <details>
141
+ <summary>Pipeline details</summary>
142
+
143
+ The extraction pipeline runs six steps:
144
+
145
+ 1. **PDF Merging** (`PDF_Merger`): If multiple PDFs are provided, merges them into one.
146
+ 2. **Document Layout Parsing** (`FileOrURLToMarkdownConverterAPI`): Calls the MinerU API to produce structured JSON layout tokens and page images.
147
+ 3. **Layout Preprocessing** (`MinerU2LLMInputOperator`): Flattens list items and re-indexes IDs to prepare LLM-ready input.
148
+ 4. **LLM Extraction** (`ChunkedPromptedGenerator`): Chunks the layout JSON (max 128k tokens per chunk) and calls the LLM with `QAExtractPrompt` to extract QA pairs as structured XML.
149
+ 5. **Output Parsing** (`LLMOutputParser`): Parses the XML response into JSONL and copies images to `vqa_images/`.
150
+ 6. **QA Merging** (`QA_Merger`): For separated question/answer PDFs, matches question and answer blocks by chapter title and question number.
151
+ This operator includes a `strict_title_match` parameter: When set to True, the operator performs an exact string match on chapter titles. Otherwise, the operator attempts to extract Chinese or English sequence numbers from the titles for matching.
152
+
153
+ </details>
154
+
---

## Stage 2: Data Curation

```bash
python -m pipelines.curate_data \
    --input_file ./output/raw_vqa.jsonl \
    --api_url https://api.openai.com/v1 \
    --model gpt-5-mini
```

Output is saved as `curated_vqa.jsonl` in the same directory as `--input_file`.

<details>
<summary>Pipeline details</summary>

Four sequential steps:

**1. Sub-question Splitting**

Questions with multiple independent parts (e.g. (a), (b), (c)) are split into separate items. Each sub-question is paired with its corresponding sub-answer and sub-solution. Items where the question is empty, or where both the answer and the solution are empty, are discarded.

Sub-questions that are context-sensitive (e.g. (b) uses the result of (a)) are not split into separate items.

Adds field: `split_qa`

**2. Question Type Classification**

Each question is classified as one of: `Calculation`, `Proof`, `Explanation`, `Fill-in`, `Multiple-choice`, `Sketching`, `Other`.

By default, only `Calculation`, `Fill-in`, and `Multiple-choice` are retained. To change this, edit the `filter_rules` list in `DataCurationPipeline.__init__`.

Adds fields: `type`, `type_reason`

**3. Answer Extraction**

Extracts a concise final answer from the `solution` field and writes it to `answer`. Items that already have a non-empty `answer` are skipped (set `overwrite=True` in `AnswerExtractionOperator` to override).

**4. QA Filtering**

Removes items based on the following criteria:

- The question must pose a clear, specific problem suitable for an exam. Examples, statements without questions, and open-ended discussions are rejected.
- The answer must directly address the question.
- The question and answer must be self-contained, without relying on external references or omitted context.

Adds fields: `filter_result`, `filter_reason`

</details>

---

## Stage 3: Generate CoT

The answer model and judge model can use different API endpoints and API keys, which is useful when the answer model is a self-hosted open-source VLM (e.g. Qwen3-VL served via vLLM) and the judge model is a commercial API.

Use `--answer_api_key_env` / `--judge_api_key_env` to specify which environment variable holds the API key for each model (default: `DF_API_KEY` for both).

```bash
# Example: self-hosted Qwen3-VL for answers, OpenAI for judging
export VLLM_API_KEY="token-xxxx"  # or leave empty if your vLLM server needs no key
export DF_API_KEY="sk-xxxx"

python -m pipelines.generate_cot \
    --input_file ./output/curated_vqa.jsonl \
    --max_retries 5 \
    --answer_api_url https://your-vllm-server/v1 \
    --answer_model qwen3-vl-235b-thinking \
    --answer_api_key_env VLLM_API_KEY \
    --judge_api_url https://api.openai.com/v1 \
    --judge_model gpt-5-mini \
    --judge_api_key_env DF_API_KEY
```

Output is saved as `curated_vqa_with_cot.jsonl` in the same directory as `--input_file`.

<details>
<summary>Pipeline details</summary>

Uses reject sampling over up to `max_retries` rounds:

**1. Answer Generation** (`VQAReasoningAnswerGenerator`)

The LLM generates a step-by-step answer, stored in `generated_cot`. Set `skip_text_only=True` in `RejectSamplingPipeline` to process only VQA items (questions containing images); set it to `False` to process all items.

**2. Thinking Cleanup**

Strips `<think>...</think>` content from the generated answer to reduce verification cost. The cleaned answer is stored in `llm_short_answer`. This assumes the model outputs `<think>THINK</think>ANSWER` or `THINK</think>ANSWER`.
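Under that output convention, the cleanup amounts to keeping only what follows the last closing think tag. A minimal sketch of the idea (not the repository's exact implementation):

```python
import re

def strip_thinking(text: str) -> str:
    """Drop `<think>...</think>` reasoning and return only the answer part.

    Handles both `<think>THINK</think>ANSWER` and `THINK</think>ANSWER`.
    """
    if "</think>" in text:
        # Keep only what follows the last closing tag.
        return text.rsplit("</think>", 1)[1].strip()
    # No closing tag: remove any unterminated opening tag and its content.
    return re.sub(r"<think>.*", "", text, flags=re.DOTALL).strip()
```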
**3. Answer Verification** (`BenchDatasetEvaluatorQuestion`)

Compares `llm_short_answer` against the ground-truth `answer` using semantic LLM evaluation (with a 5% numerical tolerance). Items that pass are marked `answer_match_result = True` and skipped in subsequent rounds.
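For purely numerical answers, the 5% tolerance can be pictured as a relative-error check. A hedged sketch for intuition only (the actual judging is done by an LLM):

```python
def within_tolerance(predicted: float, reference: float, rel_tol: float = 0.05) -> bool:
    """Relative-error check: accept if the prediction is within
    rel_tol (default 5%) of the reference value."""
    if reference == 0.0:
        # Relative error is undefined at zero; fall back to an absolute check.
        return abs(predicted) <= rel_tol
    return abs(predicted - reference) / abs(reference) <= rel_tol
```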
Set `support_subquestions=True` to evaluate each sub-question independently; `answer_match_result` is `False` if any sub-question is wrong.

Evaluation statistics (overall accuracy, sub-question accuracy) are saved to `./cot_cache/eval_results.jsonl`:

```json
{
  "total_samples": 23584,
  "matched_samples": 12281,
  "accuracy": 0.521,
  "total_subquestions": 26380,
  "correct_subquestions": 13807,
  "subquestion_accuracy": 0.523
}
```

</details>

---

## Examples

Sample PDFs and input JSONL are provided in `examples/VQA/`:

```
examples/VQA/
├── vqa_extract_test.jsonl       # Example input for Stage 1
├── questionextract_test.pdf     # Single PDF with interleaved Q&A
├── math_question.pdf            # Questions PDF (for separated Q&A demo)
└── math_answer.pdf              # Answers PDF (for separated Q&A demo)
```

To run the full pipeline on the examples:

```bash
# Stage 1: Extract
python -m pipelines.vqa_extract_optimized_pipeline \
    --input_file ./examples/VQA/vqa_extract_test.jsonl \
    --output_dir ./output \
    --api_url https://generativelanguage.googleapis.com/v1beta/openai/ \
    --model gemini-2.5-pro

# Stage 2: Curate
python -m pipelines.curate_data \
    --input_file ./output/raw_vqa.jsonl \
    --api_url https://api.openai.com/v1 \
    --model gpt-5-mini

# Stage 3: Generate CoT
# Example: self-hosted Qwen3-VL for answers, OpenAI for judging
export VLLM_API_KEY="token-xxxx"  # or leave empty if your vLLM server needs no key
export DF_API_KEY="sk-xxxx"

python -m pipelines.generate_cot \
    --input_file ./output/curated_vqa.jsonl \
    --max_retries 5 \
    --answer_api_url https://your-vllm-server/v1 \
    --answer_model qwen3-vl-235b-thinking \
    --answer_api_key_env VLLM_API_KEY \
    --judge_api_url https://api.openai.com/v1 \
    --judge_model gpt-5-mini \
    --judge_api_key_env DF_API_KEY
```

## Note

The implementation in this repository is intended only for running a small-scale demo. If you wish to run the pipeline on a large number of books, you will likely need the [Checkpoint Resume](https://opendcai.github.io/DataFlow-Doc/en/guide/resume/) and [Batched Inference](https://opendcai.github.io/DataFlow-Doc/en/guide/batch/) features.

## License

This project is licensed under the [Apache License 2.0](LICENSE).
README_zh.md ADDED
# DataFlow-VQA

从 PDF 教材和试卷中提取、清洗、生成思维链(CoT)数据的流水线工具。

[🤗数据集](https://huggingface.co/datasets/OpenDCAI/FlipVQA)

## 概览

![DataFlow-VQA overview](static/overview_2.png)

DataFlow-VQA 通过三个顺序阶段处理 PDF 文件:

- 第一步(**Section 3.1:VQA 抽取**):使用 [MinerU](https://github.com/opendatalab/MinerU) 进行文档版面分析,再用 LLM 从中抽取带图片的结构化问答对。
- 第二步(**Section 3.2.1 到 Section 3.2.5:数据清洗**):对抽取到的问答对进行过滤和清洗——拆分小题、判断题型、抽取简洁答案、去除低质量内容。
- 第三步(**Section 3.2.6:生成 CoT**):通过 Reject Sampling 生成思维链——LLM 生成回答,与标准答案核对,答错的重新生成。

## 安装

本项目基于 [DataFlow](https://github.com/OpenDCAI/DataFlow),请先 clone 并安装:

```shell
git clone https://github.com/OpenDCAI/DataFlow.git
cd DataFlow
pip install -e ".[pdf2vqa]"
```

然后 clone 本仓库:

```shell
git clone <this-repo-url>
cd DataFlow-VQA
```

## 配置

### API 密钥

需要两个 API Key:

- `DF_API_KEY`:LLM 服务的 API Key(OpenAI、Google Gemini、DeepSeek 等均可)
- `MINERU_API_KEY`:[MinerU](https://mineru.net/apiManage/token) 文档版面解析的 API Key

```shell
export DF_API_KEY="sk-xxxxx"
export MINERU_API_KEY="sk2-xxxxx"
```

### LLM 端点

每个 pipeline 均支持 `--api_url` 和 `--model` 参数,可兼容任何 [OpenAI 兼容接口](https://platform.openai.com/docs/api-reference)(OpenAI、Gemini 代理、DeepSeek 等)。

`--api_url` 传入**基础 URL**(不含 `/chat/completions`),例如 `https://api.openai.com/v1`。

---

## 第一步:VQA 抽取

### 输入格式

创建一个 JSONL 文件,每行描述一个抽取任务:

```jsonl
{"input_pdf_paths": "./examples/VQA/questionextract_test.pdf", "name": "math1"}
{"input_pdf_paths": ["./examples/VQA/math_question.pdf", "./examples/VQA/math_answer.pdf"], "name": "math2"}
```

- `input_pdf_paths`:单个 PDF(题目和答案混排),或两个及以上 PDF 的列表(题目 PDF 放在答案 PDF 前面)。
- `name`:该任务的唯一标识符(用于目录命名和缓存)。

### 运行

```bash
python -m pipelines.vqa_extract_optimized_pipeline \
    --input_file ./examples/VQA/vqa_extract_test.jsonl \
    --output_dir ./output \
    --api_url https://generativelanguage.googleapis.com/v1beta/openai/ \
    --model gemini-2.5-pro
```

**重要:** 我们推荐在这里使用强推理模型。较弱的模型(比如 `gpt-5-mini`)在这一阶段可能表现较差。

### 输出

- `{output_dir}/raw_vqa.jsonl`:包含图片引用的问答对
- `{output_dir}/{name}/vqa_images/`:抽取出的图片
- `cache/{name}/`:中间文件(`extracted_vqa.jsonl`、`merged_qa_pairs.jsonl`、`merged_qa_pairs.md`)

每个 QA 条目包含:

```json
{
  "question": "计算 $x$ 使得 $x^2 - 1 = 0$。",
  "answer": "$x = 1$ 或 $x = -1$",
  "solution": "因式分解 $(x-1)(x+1)=0$。",
  "label": 1,
  "question_chapter_title": "第一章 二次方程",
  "answer_chapter_title": "第一章 二次方程",
  "image_basedir": "/path/to/your/images"
}
```

### 提示

**我们也支持使用本地 MinerU 部署**:在 `pipelines/vqa_extract_optimized_pipeline.py` 中将 `FileOrURLToMarkdownConverterAPI` 替换为 `FileOrURLToMarkdownConverterLocal` 或 `FileOrURLToMarkdownConverterFlash`:

```python
# 原版 opendatalab 本地版
self.mineru_executor = FileOrURLToMarkdownConverterLocal(
    intermediate_dir="intermediate",
    mineru_model_path="path/to/mineru/model",
)

# 加速版 Flash
self.mineru_executor = FileOrURLToMarkdownConverterFlash(
    intermediate_dir="intermediate",
    mineru_model_path="path/to/mineru/model",
    batch_size=4,
    replicas=1,
    num_gpus_per_replica=1,
    engine_gpu_util_rate_to_ray_cap=0.9,
)
```

详细参数参见 [DataFlow 的 MinerU 算子文档](https://github.com/OpenDCAI/DataFlow/blob/main/dataflow/operators/knowledge_cleaning/generate/mineru_operators.py)。

<details>
<summary>代码逻辑简介</summary>

抽取流水线共六步:

1. **PDF 合并**(`PDF_Merger`):如果提供了多个 PDF,先合并为一个。
2. **文档版面解析**(`FileOrURLToMarkdownConverterAPI`):调用 MinerU API,生成结构化版面 JSON 和页面图片。
3. **版面预处理**(`MinerU2LLMInputOperator`):展平列表项并重新编号,生成 LLM 输入格式。
4. **LLM 抽取**(`ChunkedPromptedGenerator`):将版面 JSON 分块(每块最多 128k token),用 `QAExtractPrompt` 提示词批量调用 LLM,生成 XML 格式的问答对。
5. **输出解析**(`LLMOutputParser`):将 XML 响应解析为 JSONL,并将图片复制到 `vqa_images/`。
6. **问答合并**(`QA_Merger`):对于题目和答案分离的 PDF,根据章节标题和题目序号进行启发式匹配。可通过 `strict_title_match` 参数控制:设置为 `True` 时对章节标题进行严格匹配,否则会尝试提取标题中的中文/英文序号再匹配。

</details>

---

## 第二步:数据清洗

```bash
python -m pipelines.curate_data \
    --input_file ./output/raw_vqa.jsonl \
    --api_url https://api.openai.com/v1 \
    --model gpt-5-mini
```

输出保存为 `--input_file` 同目录下的 `curated_vqa.jsonl`。

<details>
<summary>代码逻辑简介</summary>

共四步:

**1. 切小题**

将含多个独立小问的题目(如 (a)(b)(c))拆分为独立条目,每个小题配上对应的答案和解析。question 为空、或 answer 和 solution 均为空的条目会被丢弃。

题目内的小问如果互相有联系(比如 (b) 需要 (a) 的结果),则不会拆分为独立条目。

新增字段:`split_qa`

**2. 判断题型**

将每道题归类为以下之一:`Calculation`、`Proof`、`Explanation`、`Fill-in`、`Multiple-choice`、`Sketching`、`Other`。

默认只保留 `Calculation`、`Fill-in`、`Multiple-choice`。可通过修改 `DataCurationPipeline.__init__` 中的 `filter_rules` 自定义保留范围。

新增字段:`type`、`type_reason`

**3. 抽取答案**

从 `solution` 字段中抽取简洁答案并写入 `answer`。如 `answer` 已有内容则跳过(可在 `AnswerExtractionOperator` 中设置 `overwrite=True` 覆盖)。

**4. 题目过滤**

过滤掉不符合要求的条目,标准包括:

- 必须是明确的考题,不能是示例、纯陈述或开放性讨论。
- 答案必须直接回答问题。
- 题目和答案须自洽完整,不能依赖外部引用或省略的上下文。

新增字段:`filter_result`、`filter_reason`

</details>

---

## 第三步:生成 CoT

答题模型和评判模型可以使用不同的 API 端点和 API Key,这在答题模型是本地部署的开源 VLM(如通过 vLLM 部署的 Qwen3-VL)而评判模型是商业 API 时非常实用。

使用 `--answer_api_key_env` / `--judge_api_key_env` 指定各自使用哪个环境变量作为 API Key(默认均为 `DF_API_KEY`)。

```bash
# 示例:本地 Qwen3-VL 生成答案,OpenAI 作为评判
export VLLM_API_KEY="token-xxxx"  # 如果 vLLM server 不需要 key 可以不设
export DF_API_KEY="sk-xxxx"

python -m pipelines.generate_cot \
    --input_file ./output/curated_vqa.jsonl \
    --max_retries 5 \
    --answer_api_url https://your-vllm-server/v1 \
    --answer_model qwen3-vl-235b-thinking \
    --answer_api_key_env VLLM_API_KEY \
    --judge_api_url https://api.openai.com/v1 \
    --judge_model gpt-5-mini \
    --judge_api_key_env DF_API_KEY
```

输出保存为 `--input_file` 同目录下的 `curated_vqa_with_cot.jsonl`。

<details>
<summary>代码逻辑简介</summary>

在最多 `max_retries` 轮中进行 Reject Sampling:

**1. LLM 回答**(`VQAReasoningAnswerGenerator`)

LLM 生成分步推理过程,结果存入 `generated_cot`。在 `RejectSamplingPipeline` 中设置 `skip_text_only=True` 可只处理包含图片的题目,`False` 则处理全部题目。

**2. 清理 thinking 内容**

从生成结果中删除 `<think>...</think>` 部分以降低验证成本。清理后的答案存入 `llm_short_answer`。假设模型输出格式为 `<think>THINK</think>ANSWER` 或 `THINK</think>ANSWER`。
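在上述输出约定下,该清理步骤相当于只保留最后一个闭合 think 标签之后的内容。以下为该约定的一个简化示意(并非仓库的实际实现):

```python
import re

def strip_thinking(text: str) -> str:
    """删除 `<think>...</think>` 推理内容,只保留答案部分。

    同时兼容 `<think>THINK</think>ANSWER` 和 `THINK</think>ANSWER` 两种格式。
    """
    if "</think>" in text:
        # 只保留最后一个闭合标签之后的内容
        return text.rsplit("</think>", 1)[1].strip()
    # 没有闭合标签:删除未闭合的开标签及其后的全部内容
    return re.sub(r"<think>.*", "", text, flags=re.DOTALL).strip()
```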
**3. LLM 核对**(`BenchDatasetEvaluatorQuestion`)

将 `llm_short_answer` 与标准答案 `answer` 进行语义比较(数值允许 5% 误差)。答对的条目标记为 `answer_match_result = True`,后续轮次跳过。

设置 `support_subquestions=True` 会逐个评估小题,只要有一道小题答错,整题的 `answer_match_result` 即为 `False`。

评估统计(整体正确率、小题正确率)保存至 `./cot_cache/eval_results.jsonl`:

```json
{
  "total_samples": 23584,
  "matched_samples": 12281,
  "accuracy": 0.521,
  "total_subquestions": 26380,
  "correct_subquestions": 13807,
  "subquestion_accuracy": 0.523
}
```

</details>

---

## 示例

`examples/VQA/` 目录提供了示例 PDF 和输入 JSONL:

```
examples/VQA/
├── vqa_extract_test.jsonl       # 第一步的示例输入
├── questionextract_test.pdf     # 题目答案混排 PDF
├── math_question.pdf            # 题目 PDF(分离式示例)
└── math_answer.pdf              # 答案 PDF(分离式示例)
```

完整流水线示例:

```bash
# 第一步:抽取
python -m pipelines.vqa_extract_optimized_pipeline \
    --input_file ./examples/VQA/vqa_extract_test.jsonl \
    --output_dir ./output \
    --api_url https://generativelanguage.googleapis.com/v1beta/openai/ \
    --model gemini-2.5-pro

# 第二步:清洗
python -m pipelines.curate_data \
    --input_file ./output/raw_vqa.jsonl \
    --api_url https://api.openai.com/v1 \
    --model gpt-5-mini

# 第三步:生成 CoT
# 示例:本地 Qwen3-VL 生成答案,OpenAI 作为评判
export VLLM_API_KEY="token-xxxx"  # 如果 vLLM server 不需要 key 可以不设
export DF_API_KEY="sk-xxxx"

python -m pipelines.generate_cot \
    --input_file ./output/curated_vqa.jsonl \
    --max_retries 5 \
    --answer_api_url https://your-vllm-server/v1 \
    --answer_model qwen3-vl-235b-thinking \
    --answer_api_key_env VLLM_API_KEY \
    --judge_api_url https://api.openai.com/v1 \
    --judge_model gpt-5-mini \
    --judge_api_key_env DF_API_KEY
```

## 提示

目前的实现仅适用于跑小规模的示例。如果想用我们的方法处理大规模的书籍,你可能会需要[断点续传](https://opendcai.github.io/DataFlow-Doc/zh/guide/resume/)和[分批推理](https://opendcai.github.io/DataFlow-Doc/en/guide/batch/)这两个功能。

## 许可证

本项目基于 [Apache License 2.0](LICENSE) 开源。
app.py ADDED
@@ -0,0 +1,166 @@
+ import os
+ import sys
+ import re
+ import shutil
+ import tempfile
+ import traceback
+
+ import gradio as gr
+
+ # Ensure the repo root is on the Python path
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+
+ def run_curation(
+     input_file,
+     api_url: str,
+     api_key: str,
+     model_name: str,
+     max_workers: int,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     if input_file is None:
+         return None, "请先上传输入的 JSONL 文件。"
+     if not api_key.strip():
+         return None, "请填写 API Key。"
+
+     # Inject the key so DataFlow's APILLMServing_request picks it up
+     os.environ["DF_API_KEY"] = api_key.strip()
+
+     # Use a dedicated temp workspace so parallel runs don't collide
+     workspace = tempfile.mkdtemp(prefix="dataflow_vqa_")
+     cache_dir = os.path.join(workspace, "cache")
+     os.makedirs(cache_dir, exist_ok=True)
+
+     # curate_data.py hardcodes cache_path="./cache", so we work from workspace
+     original_cwd = os.getcwd()
+     try:
+         os.chdir(workspace)
+
+         # Late import after path & cwd are set up
+         from pipelines.curate_data import DataCurationPipeline
+
+         progress(0.05, desc="初始化 pipeline…")
+
+         pipeline = DataCurationPipeline(
+             input_file=input_file,
+             api_url=api_url.rstrip("/"),
+             model_name=model_name,
+             max_workers=int(max_workers),
+         )
+         pipeline.compile()
+
+         progress(0.15, desc="正在运行 pipeline(可能需要几分钟)…")
+         pipeline.forward()
+
+         # Locate the highest-numbered step file
+         step_files = [
+             f for f in os.listdir(cache_dir)
+             if re.match(r"curate_data_step\d+\.jsonl", f)
+         ]
+         if not step_files:
+             return None, "Pipeline 运行完成,但未找到输出文件。请检查日志。"
+
+         max_step = max(
+             int(re.findall(r"curate_data_step(\d+)\.jsonl", f)[0])
+             for f in step_files
+         )
+         output_path = os.path.join(cache_dir, f"curate_data_step{max_step}.jsonl")
+
+         # Copy to a stable temp file so Gradio can serve it
+         result_file = os.path.join(workspace, "curated_vqa.jsonl")
+         shutil.copy(output_path, result_file)
+
+         progress(1.0, desc="完成!")
+         return result_file, f"✅ 完成!共执行 {max_step} 步,结果已保存为 curated_vqa.jsonl。"
+
+     except Exception:
+         tb = traceback.format_exc()
+         return None, f"❌ 运行出错:\n```\n{tb}\n```"
+     finally:
+         os.chdir(original_cwd)
+
+
+ # ── Gradio UI ──────────────────────────────────────────────────────────────────
+
+ with gr.Blocks(
+     title="DataFlow-VQA · 数据清洗 Demo",
+     theme=gr.themes.Soft(),
+ ) as demo:
+     gr.Markdown(
+         """
+ # 🔬 DataFlow-VQA — 数据清洗 Pipeline Demo
+
+ 将从 PDF 中提取的原始 VQA 数据(`raw_vqa.jsonl`)通过多步 LLM 清洗,输出高质量的 `curated_vqa.jsonl`。
+
+ **清洗步骤:** 子问题拆分 → 题型分类过滤 → 答案提取 → 填空补全 → 文本清理 → QA 质量过滤
+
+ > 注意:所有 LLM 调用均通过您提供的 API 完成,本 Space 不存储任何数据或密钥。
+ """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown("### 📥 输入")
+             input_file = gr.File(
+                 label="上传输入 JSONL 文件(raw_vqa.jsonl)",
+                 file_types=[".jsonl"],
+             )
+             gr.Markdown("### ⚙️ API 配置")
+             api_url = gr.Textbox(
+                 label="API Base URL(不含 /chat/completions)",
+                 value="https://api.openai.com/v1",
+                 placeholder="https://api.openai.com/v1",
+             )
+             api_key = gr.Textbox(
+                 label="API Key",
+                 placeholder="sk-...",
+                 type="password",
+             )
+             model_name = gr.Textbox(
+                 label="模型名称",
+                 value="gpt-4o-mini",
+                 placeholder="gpt-4o-mini / gemini-2.0-flash / deepseek-chat …",
+             )
+             max_workers = gr.Slider(
+                 label="并发 Worker 数量",
+                 minimum=1,
+                 maximum=50,
+                 value=5,
+                 step=1,
+                 info="HF Spaces 免费版资源有限,建议不超过 10",
+             )
+             run_btn = gr.Button("▶ 开始清洗", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### 📤 输出")
+             status_box = gr.Textbox(
+                 label="运行状态",
+                 interactive=False,
+                 lines=6,
+                 placeholder="点击「开始清洗」后,状态信息将显示在这里…",
+             )
+             output_file = gr.File(
+                 label="下载清洗结果(curated_vqa.jsonl)",
+                 interactive=False,
+             )
+
+     gr.Markdown(
+         """
+ ---
+ **输入格式**:每行一个 JSON 对象,需包含 `question`、`answer`、`solution` 字段。
+
+ **支持的 API**:任何 OpenAI 兼容接口,包括 OpenAI、Google Gemini(via proxy)、DeepSeek、vLLM 等。
+
+ **项目地址**:[OpenDCAI/DataFlow-VQA](https://github.com/OpenDCAI/DataFlow-VQA)
+ """
+     )
+
+     run_btn.click(
+         fn=run_curation,
+         inputs=[input_file, api_url, api_key, model_name, max_workers],
+         outputs=[output_file, status_box],
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
examples/VQA/vqa_extract_test.jsonl ADDED
@@ -0,0 +1,2 @@
+ {"input_pdf_paths": "./examples/VQA/questionextract_test.pdf", "name": "math1"}
+ {"input_pdf_paths": ["./examples/VQA/math_question.pdf", "./examples/VQA/math_answer.pdf"], "name": "math2"}
operators/answer_extractor.py ADDED
@@ -0,0 +1,90 @@
+ from dataflow.utils.registry import OPERATOR_REGISTRY
+ from dataflow import get_logger
+ from dataflow.core import OperatorABC
+ from dataflow.utils.storage import DataFlowStorage
+ import pandas as pd
+ from typing import Union
+
+ @OPERATOR_REGISTRY.register()
+ class AnswerExtractionOperator(OperatorABC):
+     def __init__(self, llm_serving: Union[None, object] = None, overwrite: bool = False):
+         self.logger = get_logger()
+         self.llm_serving = llm_serving
+         self.overwrite = overwrite
+         self.system_prompt = "You are a professional question answering system. You will be given a question with corresponding solution. Extract a concise and accurate answer from the provided solution. Output only the answer without any additional text."
+
+     @staticmethod
+     def get_desc(lang: str = "zh"):
+         if lang == "zh":
+             return (
+                 "该算子用于从解答中提取答案,读取解答字段并调用LLM提取答案。"
+                 "输入参数:\n"
+                 "- input_solution_key:解答字段名,默认为'solution'\n"
+                 "- output_key:答案字段名,默认为'answer'\n"
+                 "- overwrite:是否覆盖已有答案,默认为False\n"
+                 "输出参数:\n"
+                 "- output_key:提取的答案"
+             )
+         elif lang == "en":
+             return (
+                 "This operator extracts answers from solutions, reading from the solution field and using LLM to extract answers."
+                 "Input Parameters:\n"
+                 "- input_solution_key: Solution field name, default 'solution'\n"
+                 "- output_key: Answer field name, default 'answer'\n"
+                 "- overwrite: Whether to overwrite existing answers, default False\n"
+                 "Output Parameters:\n"
+                 "- output_key: Extracted answer"
+             )
+         else:
+             return "AnswerExtractionOperator extracts answers from solutions using LLM."
+
+     def run(self, storage: DataFlowStorage, input_question_key: str = "question", input_solution_key: str = "solution", output_key: str = "answer"):
+         dataframe = storage.read("dataframe")
+
+         if input_solution_key not in dataframe.columns:
+             raise ValueError(f"input_solution_key: {input_solution_key} not found in dataframe columns.")
+
+         # -----------------------------------
+         # Unified empty-value check
+         # -----------------------------------
+         def _is_valid(x, *, empty_ok=False):
+             """
+             empty_ok=False: is the solution valid? (blank -> invalid)
+             empty_ok=True: is the output "empty"? (blank -> empty)
+             """
+             if x is None:
+                 return empty_ok
+             if isinstance(x, float) and pd.isna(x):
+                 return empty_ok
+             if isinstance(x, str) and x.strip() == "":
+                 return empty_ok
+             return not empty_ok
+
+         # The solution must be valid (non-null, non-blank)
+         mask = dataframe[input_solution_key].apply(lambda x: _is_valid(x, empty_ok=False))
+
+         # With overwrite=False, only process rows whose output_key is empty (blank counts as empty)
+         if not self.overwrite and output_key in dataframe.columns:
+             mask = mask & dataframe[output_key].apply(lambda x: _is_valid(x, empty_ok=True))
+
+         # Collect the valid solutions
+         solutions = dataframe.loc[mask, input_solution_key].tolist()
+         questions = dataframe.loc[mask, input_question_key].tolist()
+
+         # Call the LLM
+         if self.llm_serving:
+             prompts = [
+                 self.system_prompt + f"\n\nQuestion: {q}\nSolution: {s}\nNow extract the answer."
+                 for q, s in zip(questions, solutions)
+             ]
+             answers = self.llm_serving.generate_from_input(prompts)
+         else:
+             answers = solutions
+
+         # Write back
+         dataframe.loc[mask, output_key] = answers
+
+         output_file = storage.write(dataframe)
+         self.logger.info(f"Extracted answers saved to {output_file}")
+
+         return [output_key]
operators/bench_evaluate.py ADDED
@@ -0,0 +1,307 @@
+ import os
+ import sys
+ # Add the repo root (one level up) to sys.path
+ parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+ if parent_dir not in sys.path:
+     sys.path.insert(0, parent_dir)
+ from dataflow.utils.reasoning.AnswerExtraction import StringCleaner, UnitTextManager, AnswerExtractor
+ from prompts.bench_evaluate import AnswerJudgePromptQuestion, AnswerJudgeMultipleQuestionsPrompt
+ from dataflow.core.prompt import DIYPromptABC
+ from dataflow.utils.registry import OPERATOR_REGISTRY
+ from dataflow.utils.storage import DataFlowStorage
+ from dataflow.core import LLMServingABC
+ from dataflow.core import OperatorABC
+
+ from math_verify import parse, verify
+ from dataflow import get_logger
+ from typing import Literal
+ import pandas as pd
+ import numpy as np
+ import time
+ import re
+ import json
+ import json5
+
+ @OPERATOR_REGISTRY.register()
+ class BenchDatasetEvaluatorQuestion(OperatorABC):
+     def __init__(self,
+                  eval_result_path: str = None,
+                  compare_method: Literal["match", "semantic"] = "match",
+                  system_prompt: str = "You are a helpful assistant specialized in evaluating answer correctness.",
+                  llm_serving: LLMServingABC = None,
+                  prompt_template: DIYPromptABC = None,
+                  support_subquestions: bool = False,
+                  skip_true: bool = False,  # whether to skip samples already verified as True
+                  keep_all_samples: bool = True,  # whether to keep rows without reference answers in the output
+                  ):
+
+         if eval_result_path is None:
+             timestamp = int(time.time())
+             eval_result_path = f"result_bencheval/BenchDatasetEvaluator_result_{timestamp}.json"
+
+         self.eval_result_path = eval_result_path
+         self.compare_method = compare_method
+         self.empty_responses_count = 0  # counter for empty responses
+
+         if compare_method == "match":
+             self.compare = self.math_verify_compare
+             unit_manager = UnitTextManager()
+             string_cleaner = StringCleaner(unit_manager)
+             self.answer_extractor = AnswerExtractor(string_cleaner)
+         else:
+             if prompt_template is None:
+                 prompt_template = AnswerJudgePromptQuestion() if not support_subquestions else AnswerJudgeMultipleQuestionsPrompt()
+             self.prompt_template = prompt_template
+             self.system_prompt = system_prompt
+             self.llm_serving = llm_serving
+         self.support_subquestions = support_subquestions
+         self.skip_true = skip_true
+         self.keep_all_samples = keep_all_samples
+
+         self.logger = get_logger()
+
+     def math_verify_compare(self, answer, ground_truth):
+         try:
+             return verify(parse(str(ground_truth)), parse(str(answer)))
+         except Exception:
+             try:
+                 return verify(parse(ground_truth), parse(answer))
+             except Exception:
+                 return False
+
+     def ResolveResponse(self, response):
+         if not self.support_subquestions:
+             # Check for an empty response
+             if response is None or (isinstance(response, str) and response.strip() == ''):
+                 self.empty_responses_count += 1
+                 return False
+             try:
+                 pattern = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
+                 match = pattern.search(response)
+                 if match:
+                     result_value = match.group(1).lower()
+                 else:
+                     # Fallback parsing: check whether the response contains "true"
+                     result_value = "true" if "true" in response.lower() else "false"
+                 return result_value == "true"
+             except Exception as e:
+                 self.logger.error(f"Response format error: {response}. Error: {e}")
+                 return False
+
+         # With sub-questions, the response carries a judgement list; return "correct/total"
+         correct_num = 0
+         total_num = 0
+         try:
+             response = json5.loads(response, strict=False)  # json5 tolerates lax formatting
+             judgement = response.get("judgement", [])
+         except Exception as e:
+             self.logger.error(f"Response JSON parse error: {response}. Error: {e}")
+             self.empty_responses_count += 1
+             return "0/0"
+         for resp in judgement:
+             if isinstance(resp, bool):
+                 if resp:
+                     correct_num += 1
+                 total_num += 1
+             elif isinstance(resp, str):
+                 if resp.lower() == "true":
+                     correct_num += 1
+                     total_num += 1
+                 elif resp.lower() == "false":
+                     total_num += 1
+                 elif resp.lower() == "empty":
+                     continue  # not counted toward the total
+
+         return f"{correct_num}/{total_num}"
+
+     @staticmethod
+     def get_desc(lang: str = "zh"):
+         if lang == "zh":
+             return (
+                 "该算子用于对比预测答案与标准答案的匹配度,支持两种评估模式:\n\n"
+                 "1. 字符串匹配(match):使用数学验证方法比较答案,适用于有明确答案的问题\n"
+                 "2. 语义匹配(semantic):使用LLM评估答案的语义相似度,适用于开放性问题\n\n"
+                 "输入参数:\n"
+                 "- input_test_answer_key:预测答案字段名\n"
+                 "- input_gt_answer_key:标准答案字段名\n"
+                 "- input_question_key:问题字段名(语义匹配模式下必需)\n"
+                 "- compare_method:比较方法(match/semantic)\n\n"
+                 "输出参数:\n"
+                 "- answer_match_result:匹配结果(True/False)\n"
+                 "- 统计结果将保存到指定的eval_result_path路径\n"
+             )
+         elif lang == "en":
+             return (
+                 "This operator compares predicted answers against ground truth using two evaluation modes:\n\n"
+                 "1. String Matching (match): Uses mathematical verification to compare answers, suitable for questions with definitive answers\n"
+                 "2. Semantic Matching (semantic): Uses LLM to evaluate semantic similarity, suitable for open-ended questions\n\n"
+                 "Input Parameters:\n"
+                 "- input_test_answer_key: Predicted answer field\n"
+                 "- input_gt_answer_key: Ground truth field\n"
+                 "- input_question_key: Question field (required for semantic mode)\n"
+                 "- compare_method: Comparison method (match/semantic)\n\n"
+                 "Output Parameters:\n"
+                 "- answer_match_result: Matching result (True/False)\n"
+                 "- Statistics will be saved to the specified eval_result_path\n"
+             )
+         else:
+             return "BenchEvaluator performs answer validation using string matching or semantic comparison"
+
+     def check_column(self, required_columns: list[str], dataframe: pd.DataFrame):
+         for column in required_columns:
+             if column not in dataframe.columns:
+                 self.logger.error(f"Required column '{column}' not found in dataframe")
+                 return False
+         return True
+
+     def statistic(self, file_name_prefix: str, dataframe: pd.DataFrame, compare_method: Literal["match", "semantic"]):
+         total_samples = len(dataframe)
+         valid_samples = len(dataframe) - self.empty_responses_count
+         matched_samples = sum(dataframe['answer_match_result'])
+         accuracy = matched_samples / valid_samples if valid_samples > 0 else 0
+
+         # Build the statistics dict
+         stats = {
+             "bench_name_or_prefix": file_name_prefix,
+             "total_samples": total_samples,
+             "valid_samples": valid_samples,
+             "matched_samples": matched_samples,
+             "accuracy": float(accuracy),  # ensure JSON-serializable
+             "empty_responses_count": self.empty_responses_count,
+             "compare_method": compare_method
+         }
+
+         if self.support_subquestions:
+             total_subquestions = dataframe['total_subquestions'].sum()
+             correct_subquestions = dataframe['correct_answer_num'].sum()
+             subquestion_accuracy = correct_subquestions / total_subquestions if total_subquestions > 0 else 0
+             stats.update({
+                 "total_subquestions": int(total_subquestions),
+                 "correct_subquestions": int(correct_subquestions),
+                 "subquestion_accuracy": float(subquestion_accuracy)
+             })
+
+         # Convert the dict to a DataFrame
+         stats_df = pd.DataFrame([stats])
+
+         # Write the statistics straight to self.eval_result_path
+         os.makedirs(os.path.dirname(self.eval_result_path), exist_ok=True)
+         stats_df.to_json(self.eval_result_path, orient="records", force_ascii=False, indent=2)
+         self.logger.success(f"Statistics saved to {self.eval_result_path}")
+
+         return stats_df
+
+     def run(
+         self,
+         storage: DataFlowStorage,
+         input_test_answer_key: str = "generated_cot",
+         input_gt_answer_key: str = "golden_answer",
+         input_question_key: str = None,
+     ) -> list:
+
+         self.test_answer_key = input_test_answer_key
+         self.gt_answer_key = input_gt_answer_key
+         self.question_key = input_question_key
+
+         dataframe = storage.read("dataframe")
+         if 'answer_match_result' not in dataframe.columns:
+             dataframe['answer_match_result'] = False
+
+         if self.compare_method == "match":
+             required_columns = [input_test_answer_key, input_gt_answer_key]
+             if self.check_column(required_columns=required_columns, dataframe=dataframe) is False:
+                 return required_columns
+
+             answers = dataframe[self.test_answer_key]
+             ground_truths = dataframe[self.gt_answer_key]
+             for i in range(len(answers)):
+                 final_answer = self.answer_extractor.extract_answer(answers[i], None)
+                 if self.compare(final_answer, ground_truths[i]):
+                     dataframe.at[i, 'answer_match_result'] = True
+                 else:
+                     dataframe.at[i, 'answer_match_result'] = False
+
+             output_file = storage.write(dataframe)
+
+             # Compute statistics and write them straight to the JSON file
+             stats = self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
+
+             return [self.test_answer_key, self.gt_answer_key, 'answer_match_result']
+         else:
+             required_columns = [input_test_answer_key, input_gt_answer_key, input_question_key]
+             if self.check_column(required_columns=required_columns, dataframe=dataframe) is False:
+                 return required_columns
+
+             empty_reference_mask = dataframe[input_gt_answer_key].isna() | (dataframe[input_gt_answer_key] == '')
+             if self.skip_true:
+                 empty_reference_mask = empty_reference_mask | (dataframe['answer_match_result'] == True)
+             skipped_rows = dataframe[empty_reference_mask]
+             valid_rows = dataframe[~empty_reference_mask]
+             skipped_count = len(skipped_rows)
+
+             if len(valid_rows) == 0 and not self.skip_true:
+                 self.logger.warning("No valid samples with reference answers found. All samples skipped.")
+                 if self.keep_all_samples:
+                     output_file = storage.write(dataframe)  # keep all rows; answer_match_result stays False
+                 else:
+                     output_file = storage.write(pd.DataFrame(columns=dataframe.columns))  # keep no rows
+                 self.logger.info(f"Dataframe saved to {output_file}. Skipped {skipped_count} samples due to missing reference answers.")
+                 return required_columns + ['answer_match_result']
+
+             # Build prompts and call the LLM only for rows that have a reference answer
+             inputs = [self.prompt_template.build_prompt(
+                 question=row[input_question_key],
+                 answer=row[input_test_answer_key],
+                 reference_answer=row[input_gt_answer_key]
+             ) for _, row in valid_rows.iterrows()]
+
+             responses = self.llm_serving.generate_from_input(user_inputs=inputs, system_prompt=self.system_prompt)
+
+             results = [self.ResolveResponse(response) for response in responses]
+
+             # Update answer_match_result for the valid rows
+             valid_indices = valid_rows.index
+             if not self.support_subquestions:
+                 for i, idx in enumerate(valid_indices):
+                     dataframe.at[idx, 'answer_match_result'] = results[i]
+             else:
+                 for i, idx in enumerate(valid_indices):
+                     correct_answer_num = int(results[i].split('/')[0])
+                     total_subquestions = int(results[i].split('/')[1])
+                     dataframe.at[idx, 'correct_answer_num'] = correct_answer_num
+                     dataframe.at[idx, 'total_subquestions'] = total_subquestions
+                     dataframe.at[idx, 'answer_match_result'] = (correct_answer_num == total_subquestions) and (total_subquestions > 0)  # True only if every sub-answer is correct
+                     dataframe.at[idx, 'response_evaluation'] = responses[i]  # keep the judge's raw response
+
+             output_file = storage.write(dataframe)
+
+             # Compute statistics and write them straight to the JSON file
+             stats = self.statistic(storage.file_name_prefix, dataframe, self.compare_method)
+
+             # Reset the empty-response counter
+             self.empty_responses_count = 0
+
+             return [input_test_answer_key, input_gt_answer_key, input_question_key, 'answer_match_result']
operators/pdf2vqa/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .mineru_to_llm_input_operator import MinerU2LLMInputOperator
+ from .llm_output_parser import LLMOutputParser
+ from .qa_merger import QA_Merger
+ from .pdf_merger import PDF_Merger
operators/pdf2vqa/llm_output_parser.py ADDED
@@ -0,0 +1,146 @@
+ import os
+ import json
+ import re
+ import shutil
+ from pathlib import Path
+ from typing import Literal
+ from dataflow.core import OperatorABC
+ from dataflow.utils.registry import OPERATOR_REGISTRY
+ from dataflow.utils.storage import DataFlowStorage
+ from dataflow import get_logger
+
+ @OPERATOR_REGISTRY.register()
+ class LLMOutputParser(OperatorABC):
+     def __init__(self,
+                  output_dir,
+                  intermediate_dir: str = "intermediate",
+                  ):
+         self.logger = get_logger()
+         self.output_dir = output_dir
+         self.intermediate_dir = intermediate_dir
+
+     @staticmethod
+     def get_desc(lang: str = "zh") -> str:
+         if lang == 'zh':
+             return (
+                 "LLM输出解析算子。"
+                 "将LLM生成的包含题目和答案ID的响应文本,"
+                 "转换为结构化的QA列表,并复制相关图片到输出目录。"
+             )
+         else:
+             return (
+                 "LLM output parsing operator. "
+                 "Converts LLM-generated response text containing question and answer IDs "
+                 "into a structured QA list and copies related images to the output directory."
+             )
+
+     def _id_to_text(self, input_ids, input_json, image_prefix="images"):
+         texts = []
+         id_list = input_ids.replace(' ', '').split(',')
+         for block_id in id_list:
+             try:
+                 block_id = int(block_id)
+             except Exception:
+                 continue
+             if block_id < len(input_json):
+                 try:
+                     item = input_json[block_id]
+                 except Exception:
+                     continue
+                 if 'text' in item:
+                     texts.append(item['text'])
+                 elif 'table_body' in item:
+                     texts.append(item['table_body'])
+                 elif 'img_path' in item:
+                     try:
+                         img_path = item.get('img_path', '')
+                         img_name = os.path.basename(img_path)
+                         new_path = f"{image_prefix}/{img_name}"
+                         # image_caption is a list of strings; fall back to a plain "image" alt text
+                         texts.append(f"![{' '.join(item.get('image_caption', ['image']))}]({new_path})")
+                     except Exception:
+                         pass
+                 elif item.get('type', '') == 'list':
+                     if item['sub_type'] == 'text':
+                         try:
+                             texts.append(item['list_items'].pop(0))
+                         except Exception:
+                             pass
+         return '\n'.join(texts)
+
+     def _convert_response(self, input_response, input_json_path, image_prefix="images"):
+         qa_list = []
+         with open(input_json_path, 'r', encoding='utf-8') as infile:
+             input_json = list(json.load(infile))
+         # Extract the chapter title
+         for chapter_block in re.findall(r'<chapter>(.*?)</chapter>', input_response, flags=re.DOTALL):
+             title = re.search(r'<title>(.*?)</title>', chapter_block, flags=re.DOTALL)
+             if title:
+                 chapter_title = self._id_to_text(title.group(1).strip(), input_json, image_prefix)
+             else:
+                 chapter_title = ""
+             # Find all qa_pair blocks
+             for pair in re.findall(r'<qa_pair>(.*?)</qa_pair>', chapter_block, flags=re.DOTALL):
+                 # Extract the question part
+                 q_match = re.search(r'<question>(.*?)</question>', pair, flags=re.DOTALL)
+                 # Extract the answer part
+                 a_match = re.search(r'<answer>(.*?)</answer>', pair, flags=re.DOTALL)
+                 # Extract the solution part
+                 s_match = re.search(r'<solution>(.*?)</solution>', pair, flags=re.DOTALL)
+                 # Extract the label
+                 label_match = re.search(r'<label>(.*?)</label>', pair, flags=re.DOTALL)
+                 if not ((q_match and label_match) or (a_match and label_match) or (s_match and label_match)):
+                     continue
+                 label = label_match.group(1).strip()
+                 qa_list.append({
+                     'question': self._id_to_text(q_match.group(1).strip(), input_json, image_prefix) if q_match else "",
+                     'answer': a_match.group(1).strip() if a_match else "",
+                     'solution': self._id_to_text(s_match.group(1).strip(), input_json, image_prefix) if s_match else "",
+                     'label': label,
+                     'chapter_title': chapter_title
+                 })
+         return qa_list
+
+     def run(self, storage: DataFlowStorage,
+             input_response_path_key,
+             input_converted_layout_path_key,
+             input_name_key,
+             output_qalist_path_key,
+             ):
+         dataframe = storage.read("dataframe")
+
+         # Convert each response
+         for idx, row in dataframe.iterrows():
+             converted_json_path = row[input_converted_layout_path_key]
+             response = Path(row[input_response_path_key]).read_text(encoding='utf-8')
+             name = row[input_name_key]
+
+             # Use a bare folder name for Markdown image links; prefixing it with
+             # `name` (e.g. "math1/vqa_images") would bake a wrong relative path
+             # into the generated JSON and Markdown.
+             image_prefix = "vqa_images"
+             qa_list = self._convert_response(response, converted_json_path, image_prefix)
+             output_qalist_path = os.path.join(self.output_dir, name, "extracted_vqa.jsonl")
+             os.makedirs(os.path.dirname(output_qalist_path), exist_ok=True)
+             with open(output_qalist_path, 'w', encoding='utf-8') as outfile:
+                 for qa in qa_list:
+                     json.dump(qa, outfile, ensure_ascii=False)
+                     outfile.write('\n')
+
+             # Copy the images
+             src_dir = os.path.dirname(converted_json_path)
+             src_images = os.path.join(src_dir, 'vlm', 'images')
+             if not os.path.exists(src_images):
+                 src_images = os.path.join(src_dir, 'images')
+             if not os.path.exists(src_images):
+                 self.logger.warning(f"Images directory {src_images} not found, skipping image copy (PDF may contain no images).")
+             else:
+                 dst_images = os.path.join(self.output_dir, name, image_prefix)
+                 try:
+                     shutil.copytree(src_images, dst_images)
+                 except Exception as e:
+                     self.logger.warning(f"Failed to copy images from {src_images} to {dst_images}: {e}")
+
+             dataframe.loc[idx, output_qalist_path_key] = output_qalist_path
+
+         storage.write(dataframe)
operators/pdf2vqa/mineru_to_llm_input_operator.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from dataflow.core import OperatorABC
3
+ from dataflow.utils.registry import OPERATOR_REGISTRY
4
+ from dataflow.utils.storage import DataFlowStorage
5
+
6
+ from pathlib import Path
7
+
8
+ @OPERATOR_REGISTRY.register()
9
+ class MinerU2LLMInputOperator(OperatorABC):
10
+ def __init__(self):
11
+ pass
12
+
13
+    @staticmethod
+    def get_desc(lang: str = "zh") -> str:
+        if lang == 'zh':
+            return (
+                "MinerU格式转换为LLM输入格式算子。"
+                "将MinerU生成的内容列表JSON文件转换为适合LLM处理的格式,"
+                "包括展平列表项并重新编号。"
+            )
+        else:
+            return (
+                "Convert MinerU format to LLM input format operator. "
+                "Transforms the content list JSON file generated by MinerU into a format suitable for LLM processing, "
+                "including flattening list items and re-indexing."
+            )
+
+    def _convert_json(self, input_file, output_file):
+        with open(input_file, 'r', encoding="utf-8") as infile:
+            data = list(json.load(infile))
+
+        new_data = []
+        item_id = 0  # avoid shadowing the built-in `id`
+        for item in data:
+            item['id'] = item_id
+            item.pop('bbox', None)       # drop layout metadata the LLM does not need
+            item.pop('page_idx', None)
+            if item.get('type', '') == 'list':
+                if item['sub_type'] == 'text':
+                    for idx, list_item in enumerate(item.get('list_items', [])):
+                        new_item = {
+                            'type': 'text',
+                            'text': list_item,
+                            'id': item_id + idx,
+                        }
+                        new_data.append(new_item)
+                item_id += len(item.get('list_items', []))
+            else:
+                new_data.append(item)
+                item_id += 1
+
+        with open(output_file, 'w', encoding='utf-8') as outfile:
+            json.dump(new_data, outfile, ensure_ascii=False)
+
+    def run(self, storage: DataFlowStorage,
+            input_markdown_path_key,
+            output_converted_layout_key,
+            ):
+        dataframe = storage.read("dataframe")
+
+        for index, row in dataframe.iterrows():
+            md_path = Path(row[input_markdown_path_key])
+            try:
+                input_json_path = list(md_path.parent.glob("*_content_list.json"))[0]
+            except IndexError:
+                raise ValueError("No _content_list.json file found in the API result. There might be an error with the MinerU API.")
+
+            converted_path = str(input_json_path).replace('.json', '_converted.json')
+            self._convert_json(input_json_path, converted_path)
+            dataframe.at[index, output_converted_layout_key] = converted_path
+
+            with open(converted_path, 'r', encoding='utf-8') as infile:
+                data = json.load(infile)
+            assert isinstance(data, list), f"Expected list, got {type(data)} for {input_json_path}"
+
+        storage.write(dataframe)
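The flattening and re-indexing done by `_convert_json` can be sketched as a small standalone function. The sample input below is hypothetical, not real MinerU output, and unlike the operator this sketch keeps list blocks whose `sub_type` is not `"text"` instead of dropping them:

```python
def flatten_content_list(data):
    new_data = []
    next_id = 0
    for item in data:
        item = dict(item)          # work on a copy
        item.pop('bbox', None)     # layout metadata the LLM does not need
        item.pop('page_idx', None)
        if item.get('type') == 'list' and item.get('sub_type') == 'text':
            # Expand each list entry into its own sequentially numbered text item
            for list_item in item.get('list_items', []):
                new_data.append({'type': 'text', 'text': list_item, 'id': next_id})
                next_id += 1
        else:
            item['id'] = next_id
            new_data.append(item)
            next_id += 1
    return new_data
```

Every output item ends up with a consecutive `id`, whether it came from a flattened list or was passed through unchanged.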
operators/pdf2vqa/pdf_merger.py ADDED
@@ -0,0 +1,83 @@
+import os
+from pypdf import PdfWriter
+from dataflow.core import OperatorABC
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow.utils.storage import DataFlowStorage
+
+@OPERATOR_REGISTRY.register()
+class PDF_Merger(OperatorABC):
+    def __init__(self, output_dir: str):
+        """
+        Initialize the PDF merging operator.
+
+        :param output_dir: Root directory where merged PDF files are stored
+        """
+        self.output_dir = output_dir
+        os.makedirs(self.output_dir, exist_ok=True)  # exist_ok makes a separate existence check redundant
+
+    @staticmethod
+    def get_desc(lang: str = "zh") -> str:
+        if lang == 'zh':
+            return (
+                "PDF 文件合并算子。"
+                "输入 PDF 路径列表,按顺序合并为一个 PDF 文件,"
+                "并保存到指定目录。"
+            )
+        else:
+            return (
+                "PDF merging operator. "
+                "Takes a list of PDF paths, merges them in order into a single PDF, "
+                "and saves it to the specified directory."
+            )
+
+    def run(self,
+            storage: DataFlowStorage,
+            input_pdf_list_key: str,
+            input_name_key: str,
+            output_pdf_path_key: str
+            ):
+        """
+        Execute the merge logic.
+
+        :param input_pdf_list_key: Column in the DataFrame holding the PDF path list (str or list[str])
+        :param input_name_key: Column used for naming (e.g. a file name or ID)
+        :param output_pdf_path_key: Column the merged result path is written to
+        """
+        dataframe = storage.read("dataframe")
+
+        for idx, row in dataframe.iterrows():
+            pdf_paths = row[input_pdf_list_key]
+            if isinstance(pdf_paths, str):
+                pdf_paths = [pdf_paths]
+            name = row[input_name_key]
+
+            # Build the output path: output_dir/<name>/<name>_merged.pdf
+            save_dir = os.path.join(self.output_dir, str(name))
+            os.makedirs(save_dir, exist_ok=True)
+            output_path = os.path.join(save_dir, f"{name}_merged.pdf")
+
+            merger = PdfWriter()
+            try:
+                valid_count = 0
+                for path in pdf_paths:
+                    if os.path.exists(path):
+                        merger.append(path)
+                        valid_count += 1
+
+                if valid_count > 0:
+                    with open(output_path, "wb") as f:
+                        merger.write(f)
+                    # Write the result back to the dataframe
+                    dataframe.loc[idx, output_pdf_path_key] = output_path
+                else:
+                    dataframe.loc[idx, output_pdf_path_key] = None
+            except Exception as e:
+                print(f"Error merging PDFs for {name}: {e}")
+                dataframe.loc[idx, output_pdf_path_key] = None
+            finally:
+                merger.close()  # close the writer on both success and failure paths
+
+        storage.write(dataframe)
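The path handling in `run` (normalising the input, skipping missing files, building `output_dir/<name>/<name>_merged.pdf`) can be isolated into a pure function. The `exists` parameter below is an assumption introduced purely so the logic is testable without touching the filesystem:

```python
import os

def plan_merge(pdf_paths, name, output_dir, exists=os.path.exists):
    """Mirror the path handling above: normalise the input to a list, drop
    paths that do not exist, and build output_dir/<name>/<name>_merged.pdf."""
    if isinstance(pdf_paths, str):
        pdf_paths = [pdf_paths]          # a single path is treated as a one-item list
    valid = [p for p in pdf_paths if exists(p)]
    output_path = os.path.join(output_dir, str(name), f"{name}_merged.pdf")
    return valid, output_path
```

Separating the planning from the actual `PdfWriter` calls keeps the merge loop itself trivial to reason about.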
operators/pdf2vqa/qa_merger.py ADDED
@@ -0,0 +1,84 @@
+import os
+import json
+import re
+
+from dataflow.core import OperatorABC
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow.utils.storage import DataFlowStorage
+from utils.format_utils import merge_qa_pair, jsonl_to_md
+
+@OPERATOR_REGISTRY.register()
+class QA_Merger(OperatorABC):
+    def __init__(self, output_dir, strict_title_match=False):
+        self.output_dir = output_dir
+        self.strict_title_match = strict_title_match
+
+    @staticmethod
+    def get_desc(lang: str = "zh") -> str:
+        if lang == 'zh':
+            return (
+                "QA对合并算子。"
+                "将问题和答案的QA列表进行合并,生成最终的QA对文件,"
+                "并转换为Markdown格式。"
+            )
+        else:
+            return (
+                "QA pair merging operator. "
+                "Merges question and answer QA lists to generate final QA pair files, "
+                "and converts them to Markdown format."
+            )
+
+    def run(self, storage: DataFlowStorage,
+            input_qalist_path_key,
+            input_name_key,
+            output_merged_qalist_path_key,
+            output_merged_md_path_key,
+            output_qa_item_key="qa_item"  # column that will hold the exploded QA items
+            ):
+        dataframe = storage.read("dataframe")
+
+        # Initialize the column as object dtype so it can hold list values
+        dataframe[output_qa_item_key] = None
+        dataframe[output_qa_item_key] = dataframe[output_qa_item_key].astype(object)
+
+        for idx, row in dataframe.iterrows():
+            qa_list_path = row[input_qalist_path_key]
+            name = row[input_name_key]
+
+            output_merged_qalist_path = os.path.join(self.output_dir, name, "merged_qa_pairs.jsonl")
+            merge_qa_pair(qa_list_path, output_merged_qalist_path, strict_title_match=self.strict_title_match)
+
+            output_merged_md_path = os.path.join(self.output_dir, name, "merged_qa_pairs.md")
+            jsonl_to_md(output_merged_qalist_path, output_merged_md_path)
+
+            qa_pairs = []
+            if os.path.exists(output_merged_qalist_path):
+                with open(output_merged_qalist_path, 'r', encoding='utf-8') as f:
+                    qa_pairs = [json.loads(line) for line in f]
+
+            dataframe.at[idx, output_qa_item_key] = qa_pairs
+
+            dataframe.loc[idx, output_merged_qalist_path_key] = output_merged_qalist_path
+            dataframe.loc[idx, output_merged_md_path_key] = output_merged_md_path
+
+        dataframe = dataframe.explode(output_qa_item_key).reset_index(drop=True)
+
+        # When aggregating the JSONL, image paths in ![alt](path) must be rewritten to name/path
+        def fix_image_paths(row):
+            qa_item = row[output_qa_item_key]
+            name_val = str(row[input_name_key])
+
+            if isinstance(qa_item, dict):
+                keys_to_check = ["question", "answer", "solution"]
+                for key in keys_to_check:
+                    if key in qa_item and isinstance(qa_item[key], str):
+                        qa_item[key] = re.sub(
+                            r'!\[(.*?)\]\((.*?)\)',
+                            lambda m: f"![{m.group(1)}]({os.path.join(name_val, m.group(2))})",
+                            qa_item[key]
+                        )
+            return qa_item
+
+        dataframe[output_qa_item_key] = dataframe.apply(fix_image_paths, axis=1)
+
+        storage.write(dataframe)
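The image-path rewrite inside `fix_image_paths` is the only non-trivial piece; as a minimal sketch, it amounts to prefixing the path of every Markdown image with the per-document directory:

```python
import os
import re

def prefix_image_paths(text, name):
    """Rewrite every ![alt](path) so that path is prefixed with the
    per-document directory `name`, as fix_image_paths does above."""
    return re.sub(
        r'!\[(.*?)\]\((.*?)\)',
        lambda m: f"![{m.group(1)}]({os.path.join(name, m.group(2))})",
        text,
    )
```

Text without image markers passes through unchanged, since `re.sub` simply finds no matches.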
operators/question_answer_clean.py ADDED
@@ -0,0 +1,104 @@
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow import get_logger
+from dataflow.utils.storage import DataFlowStorage
+from dataflow.core import OperatorABC
+from dataflow.core import LLMServingABC
+
+
+@OPERATOR_REGISTRY.register()
+class LLMTextCleanerOperator(OperatorABC):
+    def __init__(
+        self,
+        llm_serving: LLMServingABC,
+        prompt_template,
+        max_batch_size: int = 32
+    ):
+        if prompt_template is None:
+            raise ValueError("prompt_template cannot be None")
+
+        self.logger = get_logger()
+        self.llm_serving = llm_serving
+        self.prompt_template = prompt_template
+        self.max_batch_size = max_batch_size
+
+    def apply_deletions(self, original_text, deletion_output):
+        """Delete the LLM-specified fragments from the original text."""
+        if not deletion_output or deletion_output.strip() == "NONE":
+            return original_text
+        # Split the fragments on ||
+        fragments = [frag.strip() for frag in deletion_output.split("||") if frag.strip()]
+        # Longest first, so deleting a short fragment cannot clip part of a longer one
+        fragments = sorted(fragments, key=len, reverse=True)
+        cleaned = original_text
+        for frag in fragments:
+            cleaned = cleaned.replace(frag, "", 1)  # delete at most one occurrence
+        return cleaned
+
+    def run(
+        self,
+        storage: DataFlowStorage,
+        output_key: str = "cleaned_dataframe",
+        question_column: str = "question",
+        answer_column: str = "answer",
+        **input_keys
+    ):
+        self.storage: DataFlowStorage = storage
+        self.output_key = output_key
+        self.question_column = question_column
+        self.answer_column = answer_column
+        self.logger.info("Running LLMTextCleanerOperator...")
+
+        dataframe = storage.read('dataframe')
+        self.logger.info(f"Loading dataframe, number of rows: {len(dataframe)}")
+
+        if len(dataframe) == 0:
+            self.logger.warning("No data to process")
+            storage.write(dataframe)
+            return output_key
+
+        question_prompts = []
+        answer_prompts = []
+
+        for _, row in dataframe.iterrows():
+            question = str(row.get(question_column, ""))
+            answer = str(row.get(answer_column, ""))
+
+            question_prompts.append(self.prompt_template.build_question_prompt(question))
+            answer_prompts.append(self.prompt_template.build_answer_prompt(answer))
+
+        self.logger.info(f"Prepared {len(question_prompts)} question prompts and {len(answer_prompts)} answer prompts")
+
+        question_deletion_outputs = self.llm_serving.generate_from_input(question_prompts)
+        self.logger.info("Completed question cleaning prompts processing")
+
+        answer_deletion_outputs = self.llm_serving.generate_from_input(answer_prompts)
+        self.logger.info("Completed answer cleaning prompts processing")
+
+        cleaned_questions = []
+        cleaned_answers = []
+
+        for i in range(len(question_deletion_outputs)):
+            original_question = str(dataframe.iloc[i][question_column])
+            original_answer = str(dataframe.iloc[i][answer_column])
+
+            cleaned_questions.append(self.apply_deletions(original_question, question_deletion_outputs[i]).strip())
+            cleaned_answers.append(self.apply_deletions(original_answer, answer_deletion_outputs[i]).strip())
+
+        result_dataframe = dataframe.copy()
+        result_dataframe[question_column] = cleaned_questions
+        result_dataframe[answer_column] = cleaned_answers
+
+        storage.write(result_dataframe)
+        self.logger.info(f"Cleaning completed, processed {len(result_dataframe)} rows")
+
+        return output_key
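The deletion protocol used by this operator (the LLM returns `||`-separated fragments to remove, or the sentinel `NONE`) can be exercised in isolation; this is a standalone copy of `apply_deletions`:

```python
def apply_deletions(original_text, deletion_output):
    """The LLM's output is either 'NONE' (keep the text) or a list of
    fragments to delete, joined by '||'."""
    if not deletion_output or deletion_output.strip() == "NONE":
        return original_text
    fragments = [frag.strip() for frag in deletion_output.split("||") if frag.strip()]
    # Longest first, so deleting a short fragment cannot clip part of a longer one
    for frag in sorted(fragments, key=len, reverse=True):
        original_text = original_text.replace(frag, "", 1)  # at most one occurrence each
    return original_text
```

The longest-first ordering matters: deleting `"ab"` before `"abc"` would leave an orphaned `"c"` behind.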
operators/question_refiner.py ADDED
@@ -0,0 +1,66 @@
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow import get_logger
+
+from dataflow.utils.storage import DataFlowStorage
+from dataflow.core import OperatorABC
+from dataflow.core import LLMServingABC
+
+@OPERATOR_REGISTRY.register()
+class AddMissingBlankOperator(OperatorABC):
+    def __init__(
+        self,
+        llm_serving: LLMServingABC,
+        prompt_template,
+    ):
+        self.logger = get_logger()
+        self.llm_serving = llm_serving
+        self.prompt_template = prompt_template
+        if prompt_template is None:
+            raise ValueError("prompt_template cannot be None")
+
+    def run(
+        self,
+        storage: DataFlowStorage,
+        output_key: str = "question",
+        **input_keys
+    ):
+        self.storage: DataFlowStorage = storage
+        self.output_key = output_key
+        self.logger.info("Running AddMissingBlankOperator...")
+        self.input_keys = input_keys
+
+        need_fields = set(input_keys.keys())
+
+        # Load the raw dataframe from the input file
+        dataframe = storage.read('dataframe')
+        self.logger.info(f"Loading, number of rows: {len(dataframe)}")
+
+        # Only process rows where type == "Fill-in"
+        indices = []  # initialized here so the write-back loop below is always defined
+        if 'type' not in dataframe.columns:
+            self.logger.warning("No 'type' column found, skipping LLM generation.")
+        else:
+            mask = dataframe['type'] == "Fill-in"
+            indices = dataframe.index[mask].tolist()
+            if not indices:
+                self.logger.info("No rows with type=='Fill-in' to process.")
+
+        generated_outputs = []
+        if indices:
+            llm_inputs = []
+            for idx in indices:
+                row = dataframe.loc[idx]
+                key_dict = {key: row[input_keys[key]] for key in need_fields}
+                llm_inputs.append(self.prompt_template.build_prompt(need_fields, **key_dict))
+            self.logger.info(f"Prepared {len(llm_inputs)} prompts for LLM generation.")
+            generated_outputs = self.llm_serving.generate_from_input(llm_inputs)
+
+        # Write generated outputs back only to the selected rows; other rows keep their original values
+        for idx, gen_output in zip(indices, generated_outputs):
+            if gen_output != "ORIGINAL":
+                dataframe.at[idx, output_key] = gen_output
+
+        self.storage.write(dataframe)
+        return output_key
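The select-then-write-back pattern in `run` can be shown on a toy DataFrame. The rows and "generated" outputs below are made up; `"ORIGINAL"` is the sentinel the operator itself uses to mean "leave the question unchanged":

```python
import pandas as pd

df = pd.DataFrame({
    "type": ["Fill-in", "Calculation", "Fill-in"],
    "question": ["The capital of France is", "What is 2+2?", "Water boils at"],
})

# Select only fill-in-the-blank rows, mirroring the mask in run()
mask = df["type"] == "Fill-in"
indices = df.index[mask].tolist()

# Hypothetical LLM outputs, one per selected row; "ORIGINAL" means no change
generated_outputs = ["The capital of France is ____.", "ORIGINAL"]

# Write back only rewritten questions; unselected rows are untouched
for idx, gen_output in zip(indices, generated_outputs):
    if gen_output != "ORIGINAL":
        df.at[idx, "question"] = gen_output
```

Only the first row is updated here: the second `Fill-in` row returned the sentinel and the `Calculation` row was never sent to the LLM at all.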
operators/vqa_answer_generator.py ADDED
@@ -0,0 +1,229 @@
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow import get_logger
+from dataflow.utils.storage import DataFlowStorage
+from dataflow.core import OperatorABC
+from dataflow.core import LLMServingABC
+
+from dataflow.prompts.reasoning.math import MathAnswerGeneratorPrompt
+from dataflow.prompts.reasoning.general import GeneralAnswerGeneratorPrompt
+from dataflow.prompts.reasoning.diy import DiyAnswerGeneratorPrompt
+from dataflow.core.prompt import prompt_restrict, DIYPromptABC
+
+import pandas as pd
+from typing import Union, List, Tuple
+import re
+import os
+
+@prompt_restrict(
+    MathAnswerGeneratorPrompt,
+    GeneralAnswerGeneratorPrompt,
+    DiyAnswerGeneratorPrompt
+)
+@OPERATOR_REGISTRY.register()
+class VQAReasoningAnswerGenerator(OperatorABC):
+    '''
+    Answer Generator is a class that generates answers for given questions.
+    '''
+    def __init__(self,
+                 llm_serving: LLMServingABC,
+                 prompt_template: Union[MathAnswerGeneratorPrompt, GeneralAnswerGeneratorPrompt, DiyAnswerGeneratorPrompt, DIYPromptABC, None] = None,
+                 skip_text_only: bool = False,
+                 input_image_default_basedir="./"
+                 ):
+        self.logger = get_logger()
+
+        if prompt_template is None:
+            # Default to a prompt *instance*; a class default would never be None
+            prompt_template = MathAnswerGeneratorPrompt()
+        self.prompts = prompt_template
+        self.llm_serving = llm_serving
+        self.skip_text_only = skip_text_only
+        self.input_image_default_basedir = input_image_default_basedir
+
+    @staticmethod
+    def get_desc(lang: str = "zh"):
+        if lang == "zh":
+            return (
+                "该算子用于为给定问题生成答案,调用大语言模型进行推理。\n"
+                "输入参数:\n"
+                "- llm_serving:LLM服务实例,用于生成答案\n"
+                "- prompt_template:提示模板对象,用于构建生成提示词\n"
+                "输出参数:\n"
+                "- output_key:生成的答案字段,默认'generated_cot'"
+            )
+        elif lang == "en":
+            return (
+                "This operator generates answers for given questions using LLMs for reasoning. \n"
+                "Input Parameters:\n"
+                "- llm_serving: LLM serving instance for answer generation\n"
+                "- prompt_template: Prompt template object for constructing generation prompts\n"
+                "Output Parameters:\n"
+                "- output_key: Generated answer field, default 'generated_cot'"
+            )
+        else:
+            return "AnswerGenerator produces answers for questions using large language models."
+
+    def _validate_dataframe(self, dataframe: pd.DataFrame):
+        # Only the question column is strictly required; the basedir, caption and
+        # skip columns are optional, and run() falls back to defaults when absent.
+        if self.input_key not in dataframe.columns:
+            raise ValueError(f"Missing required column: {self.input_key}")
+
+    def _prepare_vlm_inputs(self, dataframe) -> Tuple[List[str], List[List[str]], List[List[str]], List[int], List[int]]:
+        """
+        Parses prompts for image markdown, extracts paths and text segments,
+        and structures them into interleaved lists for the VLM server.
+
+        Returns:
+            user_prompts: List[str] (all question prompts)
+            list_of_image_paths: List[List[str]] (absolute image paths per request)
+            list_of_text_segments: List[List[str]] (image labels per request)
+            vqa_ids: List[int] (indices of questions that contain images)
+            unskipped_ids: List[int] (indices of questions that were not skipped;
+                skipped rows keep their existing answers instead of being re-answered)
+        """
+        list_of_image_paths: List[List[str]] = []
+        list_of_text_segments: List[List[str]] = []
+        user_prompts: List[str] = []
+        vqa_ids = []
+
+        # Markdown image pattern: ![label](path)
+        markdown_pattern = re.compile(r"!\[(.*?)\]\((.*?)\)")
+
+        questions = dataframe[self.input_key].tolist()
+
+        unskipped_ids = []
+
+        for index, question in enumerate(questions):
+
+            # 1. Determine the base directory for the images
+            base_dir = self.input_image_default_basedir
+            if self.input_image_basedir_key in dataframe.columns:
+                row_base_dir = dataframe.loc[index, self.input_image_basedir_key]
+                if row_base_dir:
+                    base_dir = row_base_dir
+
+            # 2. Prepare the structures for this request
+            current_paths: List[str] = []
+            current_segments: List[str] = []
+            current_user_prompt: str = ""
+
+            last_end = 0
+
+            # Find all image matches
+            matches = list(markdown_pattern.finditer(question))
+
+            # 3. Handle text-only prompts
+            if not matches:
+                if not self.skip_text_only:
+                    # Text-only prompt: build it directly, with no image segments
+                    if self.input_skip_key is not None and self.input_skip_key in dataframe.columns:
+                        if dataframe.loc[index, self.input_skip_key]:
+                            continue
+                    final_prompt_text = self.prompts.build_prompt(question)
+                    # If a caption column exists, append the caption information
+                    if self.input_caption_key is not None and self.input_caption_key in dataframe.columns:
+                        captions = dataframe.loc[index, self.input_caption_key]
+                        if captions and isinstance(captions, list):
+                            for cap_i, caption in enumerate(captions):
+                                final_prompt_text += f"\n Description of image {cap_i+1}: {caption}"
+                    user_prompts.append(final_prompt_text)
+                    list_of_image_paths.append([])
+                    list_of_text_segments.append([])
+                    unskipped_ids.append(index)
+                continue  # text-only rows never enter the image loop below
+
+            vqa_complete = True
+            # 4. Walk the matches, extracting interleaved text segments and image paths
+            for match in matches:
+                leading_text = question[last_end:match.start()].strip()
+                if leading_text:
+                    current_user_prompt += leading_text
+
+                label = match.group(1).strip()
+                path = match.group(2).strip()
+
+                current_segments.append(label)
+
+                # 4c. Record the absolute path
+                full_path = os.path.join(base_dir, path)
+
+                # Check that the image file exists
+                if not os.path.isfile(full_path):
+                    self.logger.warning(f"Image file not found: {full_path} (from question index {index})")
+                    vqa_complete = False
+                    break
+
+                current_paths.append(full_path)
+
+                last_end = match.end()
+
+            trailing_text = question[last_end:].strip()
+            if trailing_text:
+                current_user_prompt += trailing_text
+
+            # If a caption column exists, append the caption information
+            if self.input_caption_key is not None and self.input_caption_key in dataframe.columns:
+                captions = dataframe.loc[index, self.input_caption_key]
+                if captions and isinstance(captions, list):
+                    for cap_i, caption in enumerate(captions):
+                        current_user_prompt += f"\n Description of image {cap_i+1}: {caption}"
+
+            # 5. Store the results for this request
+            if vqa_complete:
+                vqa_ids.append(index)
+                if self.input_skip_key is not None and self.input_skip_key in dataframe.columns:
+                    if dataframe.loc[index, self.input_skip_key]:
+                        continue
+                list_of_image_paths.append(current_paths)
+                list_of_text_segments.append(current_segments)
+                user_prompts.append(self.prompts.build_prompt(current_user_prompt))
+                unskipped_ids.append(index)
+
+        return user_prompts, list_of_image_paths, list_of_text_segments, vqa_ids, unskipped_ids
+
+    def run(
+        self,
+        storage,
+        input_key: str = "instruction",
+        output_key: str = "generated_cot",
+        input_caption_key: str | None = None,
+        input_skip_key: str | None = None,
+        input_image_basedir_key="image_basedir",
+    ):
+        '''
+        Runs the answer generation process, reading from the input file and saving results to output.
+        '''
+        self.input_key, self.output_key = input_key, output_key
+        self.input_caption_key = input_caption_key
+        self.input_skip_key = input_skip_key
+        self.input_image_basedir_key = input_image_basedir_key
+        dataframe = storage.read("dataframe")
+        self._validate_dataframe(dataframe)
+
+        # 1. Prepare the VLM inputs: parse the markdown and collect paths and text segments
+        user_prompts, list_of_image_paths, list_of_image_labels, vqa_ids, unskipped_ids = self._prepare_vlm_inputs(dataframe)
+
+        # 2. System prompt (a fixed default; self.prompts does not carry one)
+        system_prompt = "You are an intelligent chatbot good at college subjects."
+
+        answers = self.llm_serving.generate_from_input_multi_images(
+            list_of_image_paths=list_of_image_paths,
+            list_of_image_labels=list_of_image_labels,
+            system_prompt=system_prompt,
+            user_prompts=user_prompts
+        )
+
+        if self.skip_text_only:
+            # Keep only the rows listed in vqa_ids; note the index is deliberately not reset
+            dataframe = dataframe.loc[vqa_ids].copy()
+
+        for idx, ans in zip(unskipped_ids, answers):
+            dataframe.at[idx, self.output_key] = ans
+
+        output_file = storage.write(dataframe)
+        self.logger.info(f"Results saved to {output_file}")
+
+        return [output_key]
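The core of `_prepare_vlm_inputs` is the split of a question into interleaved text and `![label](path)` image markers. A minimal sketch of just that parsing step, with the existence checks and prompt building left out: note that, like the operator, the stripped text segments are concatenated without a separator.

```python
import re

MD_IMAGE = re.compile(r"!\[(.*?)\]\((.*?)\)")

def split_question(question):
    """Split a question containing ![label](path) markers into
    (text, labels, paths)."""
    labels, paths, text_parts = [], [], []
    last_end = 0
    for m in MD_IMAGE.finditer(question):
        text_parts.append(question[last_end:m.start()].strip())  # text before the image
        labels.append(m.group(1).strip())
        paths.append(m.group(2).strip())
        last_end = m.end()
    text_parts.append(question[last_end:].strip())               # trailing text
    text = "".join(part for part in text_parts if part)
    return text, labels, paths
```

Questions with no image markers come back unchanged, with empty label and path lists, which is exactly the text-only branch of the operator.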
pipelines/curate_data.py ADDED
@@ -0,0 +1,271 @@
+import os
+import sys
+import json
+import json5
+import pandas as pd
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+from dataflow.operators.core_text import PandasOperator, FormatStrPromptedGenerator
+from operators.bench_evaluate import BenchDatasetEvaluatorQuestion
+from operators.answer_extractor import AnswerExtractionOperator
+from operators.question_refiner import AddMissingBlankOperator
+from operators.question_answer_clean import LLMTextCleanerOperator
+
+from dataflow.pipeline import PipelineABC
+from dataflow.serving import APILLMServing_request
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.reasoning import (
+    ReasoningAnswerGenerator,
+    ReasoningAnswerGroundTruthFilter
+)
+from dataflow.prompts.reasoning.general import GeneralAnswerGeneratorPrompt
+from prompts.curate_data import TypeClassifyPrompt, SubQuestionSplitingPrompt, QAFilterPrompt
+from prompts.question_refine import AddMissingBlankPrompt
+from prompts.question_answer_clean import TextCleaningPrompt
+from dataflow.operators.core_text import GeneralFilter
+import argparse
+import re
+import shutil
+
+
+class DataCurationPipeline(PipelineABC):
+    def __init__(self, input_file, api_url, model_name, max_workers=100):
+        super().__init__()
+        self.storage = FileStorage(
+            first_entry_file_name=input_file,
+            cache_path="./cache",
+            file_name_prefix="curate_data",
+            cache_type="jsonl",
+        )
+
+        self.llm_serving = APILLMServing_request(
+            api_url=f"{api_url}/chat/completions",
+            model_name=model_name,
+            max_workers=max_workers,
+        )
+
+        self.sub_qa_justify = FormatStrPromptedGenerator(
+            llm_serving=self.llm_serving,
+            prompt_template=SubQuestionSplitingPrompt()
+        )
+        self.sub_qa_spliter = PandasOperator(
+            [split_generated_content]
+        )
+
+        # Extract concise answers from solutions
+        self.answer_extractor = AnswerExtractionOperator(
+            llm_serving=self.llm_serving,
+            overwrite=False
+        )
+
+        # Classify question types
+        self.type_filter = FormatStrPromptedGenerator(
+            llm_serving=self.llm_serving,
+            prompt_template=TypeClassifyPrompt()
+        )
+        self.type_filter_processor = PandasOperator(
+            [extract_type_and_reason]
+        )
+        self.type_filter_executor = GeneralFilter(
+            filter_rules=[lambda df: df['type'].isin(["Calculation", "Fill-in", "Multiple-choice"])]
+        )
+
+        self.add_missing_blank = AddMissingBlankOperator(
+            llm_serving=self.llm_serving,
+            prompt_template=AddMissingBlankPrompt()
+        )
+
+        # Filter items with unverifiable or poorly paired QA
+        self.qa_filter = FormatStrPromptedGenerator(
+            llm_serving=self.llm_serving,
+            prompt_template=QAFilterPrompt()
+        )
+        self.qa_filter_processor = PandasOperator(
+            [extract_filter_result_and_reason]
+        )
+        self.qa_filter_executor = GeneralFilter(
+            filter_rules=[lambda df: df['filter_result'] == 'true']
+        )
+
+        # Non-content cleaning of question and answer text
+        self.text_cleaner = LLMTextCleanerOperator(
+            llm_serving=self.llm_serving,
+            prompt_template=TextCleaningPrompt()
+        )
+
+    def forward(self):
+        self.sub_qa_justify.run(
+            storage=self.storage.step(),
+            output_key="split_qa",
+            input_question="question",
+            input_answer="answer",
+            input_solution="solution",
+        )
+
+        self.sub_qa_spliter.run(
+            storage=self.storage.step(),
+        )
+
+        self.type_filter.run(
+            storage=self.storage.step(),
+            input_question="question",
+            input_answer="answer",
+            output_key="question_type"
+        )
+        self.type_filter_processor.run(
+            storage=self.storage.step(),
+        )
+        self.type_filter_executor.run(
+            storage=self.storage.step(),
+        )
+
+        self.answer_extractor.run(
+            storage=self.storage.step(),
+            input_question_key="question",
+            input_solution_key="solution",
+            output_key="answer"
+        )
+
+        self.add_missing_blank.run(
+            storage=self.storage.step(),
+            input_question="question",
+            input_answer="answer",
+            output_key="question",
+        )
+
+        self.text_cleaner.run(
+            storage=self.storage.step(),
+            question_column="question",
+            answer_column="answer",
+            output_key="cleaned_dataframe"
+        )
+
+        self.qa_filter.run(
+            storage=self.storage.step(),
+            input_question="question",
+            input_answer="answer",
+            output_key="qa_judgement"
+        )
+        self.qa_filter_processor.run(
+            storage=self.storage.step(),
+        )
+        self.qa_filter_executor.run(
+            storage=self.storage.step(),
+        )
+
+
+def split_generated_content(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Split the JSON array in the 'split_qa' column into multiple rows.
+    All other columns are preserved, and each sub_question/sub_answer is expanded.
+    Rows missing a sub_question, or missing both sub_answer and sub_solution, are dropped.
+    """
+    rows = []
+    for _, row in df.iterrows():
+        content = row.get("split_qa", None)
+
+        if pd.isna(content) or not str(content).strip():
+            continue
+
+        try:
+            # Parse the JSON array
+            items = json5.loads(content)
+            if not isinstance(items, list):
+                items = [items]
+        except Exception:
+            print(f"⚠️ JSON parse error in row: {content[:80]}...")
+            continue
+
+        for item in items:
+            sub_question = item.get("sub_question", "").strip()
+            sub_answer = item.get("sub_answer", "").strip()
+            sub_solution = item.get("sub_solution", "").strip()
+
+            # Keep only rows that have a sub_question and either a sub_answer or a sub_solution
+            if not sub_question or not (sub_answer or sub_solution):
+                continue
+
+            new_row = row.to_dict()
+            new_row["question"] = sub_question if sub_question != "ORIGINAL" else row["question"]
+            new_row["answer"] = sub_answer if sub_answer != "ORIGINAL" else row["answer"]
+            new_row["solution"] = sub_solution if sub_solution != "ORIGINAL" else row.get("solution", "")
+            rows.append(new_row)
+
+    if not rows:
+        return pd.DataFrame(columns=list(df.columns))
+
+    return pd.DataFrame(rows, columns=list(df.columns))
+
+
+def extract_type_and_reason(df: pd.DataFrame) -> pd.DataFrame:
+    df["type"] = None
+    df["type_reason"] = None
+
+    for idx, row in df.iterrows():
+        val = row.get("question_type", "")
+        if pd.isna(val) or not str(val).strip():
+            continue
+        try:
+            # Try to parse as JSON
+            j = json.loads(val)
+            df.at[idx, "type"] = j.get("type", None)
+            df.at[idx, "type_reason"] = j.get("reason", None)
+        except json.JSONDecodeError:
+            # Not JSON; fall back to splitting on ":"
+            if ":" in val:
+                parts = val.split(":", 1)
+                df.at[idx, "type"] = parts[0].strip()
+                df.at[idx, "type_reason"] = parts[1].strip()
+            else:
+                df.at[idx, "type"] = val.strip()
+                df.at[idx, "type_reason"] = ""
+
+    return df
+
+
+def extract_filter_result_and_reason(df: pd.DataFrame) -> pd.DataFrame:
+    df["filter_result"] = None
+    df["filter_reason"] = None
+
+    for idx, row in df.iterrows():
+        val = row.get("qa_judgement", "")
+        if pd.isna(val) or not str(val).strip():
+            continue
+        try:
+            # Try to parse as JSON
+            j = json.loads(val)
+            judgement = j.get("judgement", "")
+            if isinstance(judgement, bool):
+                judgement = "true" if judgement else "false"
+            df.at[idx, "filter_result"] = judgement.lower()
+            df.at[idx, "filter_reason"] = j.get("reason", None)
+        except json.JSONDecodeError:
+            df.at[idx, "filter_result"] = ""
+            df.at[idx, "filter_reason"] = ""
+
+    return df
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Data Curation Pipeline")
+    parser.add_argument("--input_file", type=str, required=True, help="Path to the input JSONL file (raw_vqa.jsonl)")
+    parser.add_argument("--api_url", type=str, default="https://api.openai.com/v1", help="Base URL of the OpenAI-compatible API (e.g. https://api.openai.com/v1)")
+    parser.add_argument("--model", type=str, default="gpt-5-mini", help="LLM model name to use for curation")
+    parser.add_argument("--max_workers", type=int, default=100, help="Number of parallel API workers")
+    args = parser.parse_args()
+
+    model = DataCurationPipeline(args.input_file, api_url=args.api_url, model_name=args.model, max_workers=args.max_workers)
+    model.compile()
+    model.forward()
+
+    # Find the latest curate_data cache step file
+    cache_files = os.listdir("./cache")
+    step_files = [f for f in cache_files if re.match(r"curate_data_step\d+\.jsonl", f)]
+    step_numbers = [int(re.findall(r"curate_data_step(\d+)\.jsonl", f)[0]) for f in step_files]
+    if not step_numbers:
+        sys.exit("No curate_data_step*.jsonl cache files found in ./cache")
+    max_step = max(step_numbers)
+    max_step_file = f"./cache/curate_data_step{max_step}.jsonl"
+
+    # Copy the final step file to the output directory as curated_vqa.jsonl.
+    # Output is placed alongside input_file so relative image paths remain valid.
+    output_dir = os.path.dirname(args.input_file)
+    output_file = os.path.join(output_dir, "curated_vqa.jsonl")
+    shutil.copy(max_step_file, output_file)
+    print(f"Curated data saved to: {output_file}")
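The cache-file selection at the end of the pipeline can be expressed as a small pure function; this sketch adds an explicit guard for the empty case, which the inline version would otherwise hit as a `max()` on an empty sequence:

```python
import re

def latest_step_file(filenames, prefix="curate_data"):
    """Pick the highest-numbered '<prefix>_step<N>.jsonl' file from a listing."""
    pattern = re.compile(rf"{re.escape(prefix)}_step(\d+)\.jsonl")
    steps = []
    for f in filenames:
        m = pattern.fullmatch(f)
        if m:
            steps.append((int(m.group(1)), f))
    if not steps:
        raise FileNotFoundError(f"no {prefix}_step*.jsonl cache files found")
    return max(steps)[1]  # tuples compare by step number first
```

Comparing `(step, name)` tuples means the numeric step number, not the string name, decides which file is newest, so `step12` correctly beats `step2`.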
pipelines/generate_cot.py ADDED
@@ -0,0 +1,166 @@
+ import os
+ import sys
+ import pandas as pd
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+ from dataflow.operators.core_text import PandasOperator
+ from operators.bench_evaluate import BenchDatasetEvaluatorQuestion
+ from operators.vqa_answer_generator import VQAReasoningAnswerGenerator
+
+ from dataflow.serving import APILLMServing_request, APIVLMServing_openai, LocalVLMServing_vllm
+ from dataflow.utils.storage import FileStorage
+ from dataflow.operators.reasoning import (
+     ReasoningAnswerGenerator,
+     ReasoningAnswerGroundTruthFilter
+ )
+ from dataflow.prompts.reasoning.math import MathAnswerGeneratorPrompt
+ from dataflow.operators.core_text import GeneralFilter
+ from dataflow import get_logger
+ from dataflow.pipeline import PipelineABC
+
+ from typing import Iterable
+ import re
+ import argparse
+ import shutil
+
+
+ def make_remove_think_fn(input_key, output_key):
+     pattern = re.compile(r'<think>.*?</think>', flags=re.DOTALL | re.IGNORECASE)
+     def fn(df):
+         df = df.copy()
+         if input_key in df.columns:
+             def clean_text(t):
+                 if pd.isna(t):
+                     return t
+                 if "</think>" not in t:
+                     return t.strip()
+                 s = "<think>" + str(t)
+                 return pattern.sub("", s).strip()
+
+             df[output_key] = df[input_key].apply(clean_text)
+
+         return df
+
+     return fn
+
+ class RejectSamplingPipeline(PipelineABC):
+     def __init__(self, first_entry_file_name, answer_api_url, judge_api_url, answer_model, judge_model,
+                  answer_api_key_env="DF_API_KEY", judge_api_key_env="DF_API_KEY",
+                  max_retries=5, max_workers=100):
+         super().__init__()
+         self.storage = FileStorage(
+             first_entry_file_name=first_entry_file_name,
+             cache_path="./cot_cache",
+             file_name_prefix="reject_sampling",
+             cache_type="jsonl",
+         )
+
+         self.max_retries = max_retries
+         self.logger = get_logger()
+
+         self.llm_answer_serving = APIVLMServing_openai(
+             api_url=answer_api_url,
+             model_name=answer_model,
+             key_name_of_api_key=answer_api_key_env,
+             max_workers=max_workers,
+             timeout=600.0,
+             max_tokens=8192,
+             temperature=0.7,
+         )
+
+         self.llm_serving = APILLMServing_request(
+             api_url=f"{judge_api_url}/chat/completions",
+             model_name=judge_model,
+             key_name_of_api_key=judge_api_key_env,
+             max_workers=max_workers,
+             read_timeout=300.0
+         )
+
+         # Difficulty filter (keep items where accuracy <= 1.0)
+         self.difficulty_filter = GeneralFilter(
+             filter_rules=[lambda df: df['accuracy'] <= 1.0]
+         )
+
+         # LLM answer generation
+         self.answer_generator = VQAReasoningAnswerGenerator(
+             llm_serving=self.llm_answer_serving,
+             prompt_template=MathAnswerGeneratorPrompt(),
+             skip_text_only=False,
+         )
+
+         self.think_cleaner = PandasOperator(process_fn=[make_remove_think_fn(input_key="generated_cot", output_key="llm_short_answer")])
+
+         self.noop = PandasOperator(process_fn=[lambda df: df])
+
+         # LLM verification
+         self.answer_groundtruth_filter = BenchDatasetEvaluatorQuestion(
+             compare_method="semantic",
+             llm_serving=self.llm_serving,
+             prompt_template=None,  # using default prompt
+             eval_result_path="./cot_cache/eval_results.jsonl",
+             support_subquestions=True,
+             skip_true=True
+         )
+
+     def forward(self):
+         self.noop.run(storage=self.storage.step(), output_key="answer_match_result")  # no-op for pipeline compilation; does nothing to the data
+         for i in range(self.max_retries):
+
+             input_skip_key = "answer_match_result" if i > 0 else None
+
+             # Generate answers (skip items already answered correctly)
+             self.answer_generator.run(
+                 storage=self.storage.step(),
+                 input_key="question",
+                 output_key="generated_cot",
+                 input_skip_key=input_skip_key,
+                 input_image_basedir_key="image_basedir",
+             )
+
+             self.think_cleaner.run(storage=self.storage.step(), output_key="llm_short_answer")
+
+             self.answer_groundtruth_filter.run(
+                 storage=self.storage.step(),
+                 input_test_answer_key="llm_short_answer",
+                 input_gt_answer_key="answer",
+                 input_question_key="question",
+             )
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="CoT Generation Pipeline with Reject Sampling")
+     parser.add_argument("--input_file", type=str, required=True, help="Path to the input JSONL file (curated_vqa.jsonl)")
+     parser.add_argument("--max_retries", type=int, default=5, help="Maximum number of reject sampling rounds")
+     parser.add_argument("--answer_api_url", type=str, default="https://api.xxx.com/v1", help="URL where you serve your Qwen model (e.g. via vLLM)")
+     parser.add_argument("--judge_api_url", type=str, default="https://api.openai.com/v1", help="Base URL of the OpenAI-compatible API for answer verification (e.g. https://api.openai.com/v1)")
+     parser.add_argument("--answer_model", type=str, default="qwen3-vl-235b-thinking", help="Model to use for answer generation")
+     parser.add_argument("--judge_model", type=str, default="gpt-5-mini", help="Model to use for answer verification")
+     parser.add_argument("--answer_api_key_env", type=str, default="DF_API_KEY", help="Environment variable name holding the API key for the answer model")
+     parser.add_argument("--judge_api_key_env", type=str, default="DF_API_KEY", help="Environment variable name holding the API key for the judge model")
+     parser.add_argument("--max_workers", type=int, default=100, help="Number of parallel API workers")
+     args = parser.parse_args()
+
+     model = RejectSamplingPipeline(
+         args.input_file,
+         answer_api_url=args.answer_api_url,
+         judge_api_url=args.judge_api_url,
+         answer_model=args.answer_model,
+         judge_model=args.judge_model,
+         answer_api_key_env=args.answer_api_key_env,
+         judge_api_key_env=args.judge_api_key_env,
+         max_retries=args.max_retries,
+         max_workers=args.max_workers,
+     )
+     model.compile()
+     model.forward()
+
+     # Find the latest reject_sampling cache step file
+     cache_files = os.listdir("./cot_cache")
+     step_files = [f for f in cache_files if re.match(r"reject_sampling_step\d+\.jsonl", f)]
+     step_numbers = [int(re.findall(r"reject_sampling_step(\d+)\.jsonl", f)[0]) for f in step_files]
+     max_step = max(step_numbers)
+     max_step_file = f"./cot_cache/reject_sampling_step{max_step}.jsonl"
+
+     # Copy output alongside input_file so relative image paths remain valid
+     output_dir = os.path.dirname(args.input_file)
+     output_file = os.path.join(output_dir, "curated_vqa_with_cot.jsonl")
+     shutil.copy(max_step_file, output_file)
+     print(f"Curated data with CoT saved to: {output_file}")
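The `<think>`-stripping step above can be exercised in isolation. A minimal sketch mirroring `clean_text` inside `make_remove_think_fn` (the function name `strip_think` is ours); note the quirk it handles: generated text often arrives with the opening `<think>` tag already consumed, so the tag is re-prepended before the regex removes the whole reasoning span:

```python
import re
import pandas as pd

pattern = re.compile(r'<think>.*?</think>', flags=re.DOTALL | re.IGNORECASE)

def strip_think(t):
    # Mirrors clean_text from the diff: if there is no closing tag, the text
    # is returned as-is; otherwise <think> is re-prepended and the span removed.
    if pd.isna(t):
        return t
    if "</think>" not in t:
        return t.strip()
    s = "<think>" + str(t)
    return pattern.sub("", s).strip()

df = pd.DataFrame({"generated_cot": ["step 1... step 2...</think>The answer is 7.", "no tags here"]})
df["llm_short_answer"] = df["generated_cot"].apply(strip_think)
print(df["llm_short_answer"].tolist())  # ['The answer is 7.', 'no tags here']
```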
pipelines/vqa_extract_optimized_pipeline.py ADDED
@@ -0,0 +1,134 @@
+ from dataflow.operators.knowledge_cleaning import FileOrURLToMarkdownConverterAPI
+
+ from dataflow.serving import APILLMServing_request
+ from dataflow.utils.storage import FileStorage
+ from operators.pdf2vqa import MinerU2LLMInputOperator, LLMOutputParser, QA_Merger, PDF_Merger
+ from dataflow.operators.core_text import ChunkedPromptedGenerator
+
+ from dataflow.pipeline import PipelineABC
+ from prompts.pdf2vqa import QAExtractPrompt
+
+ from pypdf import PdfWriter
+
+ import os
+ import json
+ import re
+ import argparse
+
+ class PDF_VQA_extract_optimized_pipeline(PipelineABC):
+     def __init__(self, input_file, api_url, model_name, max_workers=100):
+         super().__init__()
+         self.storage = FileStorage(
+             first_entry_file_name=input_file,
+             cache_path="./cache",
+             file_name_prefix="vqa",
+             cache_type="jsonl",
+         )
+
+         self.llm_serving = APILLMServing_request(
+             api_url=f"{api_url}/chat/completions",
+             key_name_of_api_key="DF_API_KEY",
+             model_name=model_name,
+             max_workers=max_workers,
+         )
+
+         self.vqa_extract_prompt = QAExtractPrompt()
+
+         self.pdf_merger = PDF_Merger(output_dir="./cache")
+
+         self.mineru_executor = FileOrURLToMarkdownConverterAPI(intermediate_dir="intermediate")
+
+         self.input_formatter = MinerU2LLMInputOperator()
+         self.vqa_extractor = ChunkedPromptedGenerator(
+             llm_serving=self.llm_serving,
+             system_prompt=self.vqa_extract_prompt.build_prompt(),
+             max_chunk_len=128000,
+         )
+         self.llm_output_parser = LLMOutputParser(output_dir="./cache", intermediate_dir="intermediate")
+         self.qa_merger = QA_Merger(output_dir="./cache", strict_title_match=False)
+
+
+     def forward(self):
+         self.pdf_merger.run(
+             storage=self.storage.step(),
+             input_pdf_list_key="input_pdf_paths",
+             input_name_key="name",
+             output_pdf_path_key="merged_pdf_path",
+         )
+         self.mineru_executor.run(
+             storage=self.storage.step(),
+             input_key="merged_pdf_path",
+             output_key="vqa_markdown_path",
+         )
+         self.input_formatter.run(
+             storage=self.storage.step(),
+             input_markdown_path_key="vqa_markdown_path",
+             output_converted_layout_key="converted_vqa_layout_path",
+         )
+         self.vqa_extractor.run(
+             storage=self.storage.step(),
+             input_path_key="converted_vqa_layout_path",
+             output_path_key="extracted_llm_vqa_path",
+         )
+         self.llm_output_parser.run(
+             storage=self.storage.step(),
+             input_response_path_key="extracted_llm_vqa_path",
+             input_converted_layout_path_key="converted_vqa_layout_path",
+             input_name_key="name",
+             output_qalist_path_key="extracted_vqa_path",
+         )
+         self.qa_merger.run(
+             storage=self.storage.step(),
+             input_qalist_path_key="extracted_vqa_path",
+             input_name_key="name",
+             output_merged_qalist_path_key="output_merged_vqalist_path",
+             output_merged_md_path_key="output_merged_md_path",
+             output_qa_item_key="vqa_pair",
+         )
+
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Run PDF VQA Extract Optimized Pipeline")
+     parser.add_argument("--input_file", type=str, default="./examples/VQA/vqa_extract_test.jsonl", help="Path to the input JSONL file")
+     parser.add_argument("--output_dir", type=str, default="./output", help="Directory to save the output files")
+     parser.add_argument("--api_url", type=str, default="https://generativelanguage.googleapis.com/v1beta/openai/", help="Base URL of the OpenAI-compatible API (e.g. https://api.openai.com/v1)")
+     parser.add_argument("--model", type=str, default="gemini-2.5-pro", help="LLM model name to use for VQA extraction; please use a powerful reasoning model")
+     parser.add_argument("--max_workers", type=int, default=100, help="Number of parallel API workers")
+     args = parser.parse_args()
+
+     pipeline = PDF_VQA_extract_optimized_pipeline(
+         input_file=args.input_file,
+         api_url=args.api_url,
+         model_name=args.model,
+         max_workers=args.max_workers,
+     )
+     pipeline.compile()
+     pipeline.forward(resume_step=5)
+
+     output_dir = args.output_dir
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Find the latest cache step file
+     cache_files = os.listdir("./cache")
+     step_files = [f for f in cache_files if re.match(r"vqa_step\d+\.jsonl", f)]
+     step_numbers = [int(re.findall(r"vqa_step(\d+)\.jsonl", f)[0]) for f in step_files]
+     max_step = max(step_numbers)
+     max_step_file = f"./cache/vqa_step{max_step}.jsonl"
+
+     # Extract QA items and save to output_dir/raw_vqa.jsonl
+     output_qa_item_key = "vqa_pair"
+     with open(max_step_file, "r") as f_in, open(os.path.join(output_dir, "raw_vqa.jsonl"), "w") as f_out:
+         for line in f_in:
+             data = json.loads(line)
+             qa_item = data[output_qa_item_key]
+             name = data["name"]
+             output_data = {"name": name, **qa_item, "image_basedir": os.path.abspath(output_dir)}
+             if not output_data["solution"]:
+                 output_data["solution"] = output_data["answer"]
+             f_out.write(json.dumps(output_data, ensure_ascii=False) + "\n")
+
+             # Copy per-task image directory to output_dir
+             src_dir = os.path.join("cache", name)
+             if os.path.exists(src_dir):
+                 os.system(f"cp -r {src_dir} {output_dir}")
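The final step shells out with `os.system("cp -r …")`, which is shell-dependent and breaks on paths containing spaces or quotes. A portable sketch of the same per-task copy using the standard library (the helper name `copy_task_images` is ours; `dirs_exist_ok` requires Python 3.8+):

```python
import os
import shutil
import tempfile

def copy_task_images(src_dir, output_dir):
    """Portable stand-in for `cp -r src_dir output_dir`: copies src_dir into
    output_dir as a subdirectory, merging into it if it already exists."""
    if not os.path.isdir(src_dir):
        return None
    dest = os.path.join(output_dir, os.path.basename(src_dir))
    shutil.copytree(src_dir, dest, dirs_exist_ok=True)  # Python 3.8+
    return dest

# A path with spaces, which would need careful quoting with os.system:
with tempfile.TemporaryDirectory() as root:
    src = os.path.join(root, "task name with spaces")
    os.makedirs(src)
    open(os.path.join(src, "fig1.jpg"), "w").close()
    out = os.path.join(root, "output")
    os.makedirs(out)
    print(os.listdir(copy_task_images(src, out)))  # ['fig1.jpg']
```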
prompts/bench_evaluate.py ADDED
@@ -0,0 +1,82 @@
+
+ from dataflow.utils.registry import PROMPT_REGISTRY
+ from dataflow.core.prompt import PromptABC
+ '''
+ A collection of prompts for model evaluation.
+ '''
+
+ @PROMPT_REGISTRY.register()
+ class AnswerJudgePromptQuestion(PromptABC):
+     """
+     Prompt template for judging the correctness of an answer.
+     """
+     def __init__(self):
+         pass
+
+     def build_prompt(self, question, answer, reference_answer):
+         prompt = f"""
+ As an answer evaluation expert, please assess whether the following answer is correct.
+
+ Question: {question}
+
+ Reference Answer: {reference_answer}
+
+ Current Answer: {answer}
+
+ Please carefully analyze whether the current answer is semantically consistent with the reference answer.
+ Focus only on comparing the answers themselves, not on how the problem is solved.
+ Don't just look at the surface text; understand the essential content of the answers.
+ If the current answer is semantically consistent with the reference answer, even if expressed differently, it should be judged as correct.
+ For numerical calculation problems, also consider whether the answer is within the acceptable error range (typically 5%). Be careful to differentiate whether the question is indeed a numerical calculation or one that requires a strictly identical answer.
+
+ Please return your judgment result in JSON format:
+ {{"judgement_result": true}} indicates the answer is correct
+ {{"judgement_result": false}} indicates the answer is incorrect
+
+ Your judgment:
+ """
+         return prompt
+
+ @PROMPT_REGISTRY.register()
+ class AnswerJudgeMultipleQuestionsPrompt(PromptABC):
+     """
+     Prompt template for answer judging that supports multiple sub-questions.
+     """
+     def __init__(self):
+         pass
+
+     def build_prompt(self, answer, reference_answer, question=None):
+         prompt = f"""
+ As an answer evaluation expert, please assess whether the following answer is correct.
+
+ Question: {question}
+
+ Reference Answer: {reference_answer}
+
+ Current Answer: {answer}
+
+ Please carefully analyze whether the current answer is semantically consistent with the reference answer.
+ Focus only on comparing the answers themselves, not on how the problem is solved.
+ Don't just look at the surface text; understand the essential content of the answers.
+ If the current answer is semantically consistent with the reference answer, even if expressed differently, it should be judged as correct.
+ For numerical calculation problems, also consider whether the answer is within the acceptable error range (typically 5%). Be careful to differentiate whether the question is indeed a numerical calculation or one that requires a strictly identical answer.
+
+ The question may contain multiple sub-questions (e.g., ①②③ or (a)(b), etc.).
+ You should first identify the sub-questions in the question, then evaluate the correctness of each corresponding part in the current answer.
+ You need to provide your reason for each sub-question's judgment.
+
+ Your judgement should be a JSON array, where each element is "true" or "false" (use strings instead of booleans), indicating whether the answer to each sub-question is correct.
+ If there is only one question, also return a single-element array.
+
+ If the reference answer is incomplete so that you are not able to judge some sub-questions, mark the corresponding sub-questions as "empty".
+
+ Example:
+ Question: ① 1+2=? ② What is 2+2? ③ What is 3+3?
+ Reference Answer: ① 3 ③ 6
+ Current Answer: ① Three ② Four ③ Seven
+ Output: {{"reason": "The answer to sub-question 1 is correct as 'Three' is semantically consistent with '3'. The reference answer does not provide information for sub-question 2, so it is marked as 'empty'. The answer to sub-question 3 is incorrect as 'Seven' is not semantically consistent with '6'.", "judgement": ["true", "empty", "false"]}}
+
+
+ Your judgment:
+ """
+         return prompt
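The multi-sub-question judge is instructed to reply with `{"reason": ..., "judgement": ["true", "empty", "false"]}`, but real model replies often wrap the object in prose or a ```json fence. A tolerant parser sketch (the function name `parse_judgement` is ours; the repo's actual parsing lives in `BenchDatasetEvaluatorQuestion`, which we have not seen):

```python
import json
import re

def parse_judgement(raw):
    """Parse a judge reply into a list of 'true'/'false'/'empty' strings,
    or None if no valid judgement object can be found."""
    # Grab the first {...} span so fences and surrounding prose are ignored.
    m = re.search(r"\{.*\}", raw, flags=re.DOTALL)
    if not m:
        return None
    try:
        obj = json.loads(m.group(0))
    except json.JSONDecodeError:
        return None
    j = obj.get("judgement")
    if isinstance(j, list) and all(v in ("true", "false", "empty") for v in j):
        return j
    return None

reply = ('Here is my verdict:\n```json\n'
         '{"reason": "sub-question 2 missing from reference", "judgement": ["true", "empty", "false"]}\n```')
print(parse_judgement(reply))  # ['true', 'empty', 'false']
```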
prompts/curate_data.py ADDED
@@ -0,0 +1,232 @@
+ import json
+ from dataflow.utils.registry import PROMPT_REGISTRY
+ from dataflow.core.prompt import PromptABC, DIYPromptABC
+ from typing import Set
+ import string
+
+ @PROMPT_REGISTRY.register()
+ class SubQuestionSplitingPrompt(DIYPromptABC):
+     def __init__(self, f_str_template: str = "{input_text}", on_missing: str = "raise"):
+         self.on_missing = on_missing
+         self.f_str_template = """
+ You are an educational question structure analysis assistant. Below is a composite question and its corresponding answer. Please split it into several independent sub-questions.
+ The requirements are as follows:
+
+ 1. The question may contain multiple sub-questions (e.g., ①②③ or (a)(b), etc.); please accurately identify and split them one by one. Only split sub-questions with clear labels.
+ Do not split implicit sub-questions (such as "What is the value of x and y?" or multiple question marks).
+ 2. Each sub-question must be self-contained and answerable. If the original question contains contextual information, include it in each sub-question to preserve full meaning.
+ If sub-questions are related (e.g., "① Find x. ② Using the value of x, find y."), do not split them; keep them as one sub-question.
+ 3. If an answer and/or solution is provided, try to match each sub-question with its corresponding part of the answer and/or solution based on semantics.
+ 4. If the original answer and/or solution contains LaTeX formulas, preserve them exactly as they appear.
+ 5. If the original answer and/or solution is missing or cannot be clearly aligned, leave `"sub_answer"` and/or `"sub_solution"` as an empty string.
+ 6. The output must be a valid JSON array, where each element contains:
+
+ * `"sub_id"`: the index of the sub-question (an integer starting from 1)
+ * `"sub_question"`: the complete text of the sub-question (or "ORIGINAL" if no splitting is needed)
+ * `"sub_answer"`: the corresponding answer, empty string if unavailable (or "ORIGINAL" if no splitting is needed)
+ * `"sub_solution"`: the corresponding solution, empty string if unavailable (or "ORIGINAL" if no splitting is needed)
+
+ [Important Notice]
+ 1. In some questions, answers or solutions, there will be figures written as `![image](image_url)`. When splitting, please keep these figure references in the corresponding sub-questions, sub-answers, or sub-solutions EXACTLY as they are.
+ 2. If the question does not need to be split, return an array with a single element, simplified as: [{"sub_id": 1, "sub_question": "ORIGINAL", "sub_answer": "ORIGINAL", "sub_solution": "ORIGINAL"}]
+ In this case, you only need to output "ORIGINAL" instead of the full text for sub_question, sub_answer, and sub_solution, so that we can save tokens.
+
+ ## Example Input:
+
+ **Question:**
+ A class has 40 students, including 25 boys and 15 girls. ![image](question_images/a284h5iuh38.jpg) ① Find the percentage of boys in the class. ② Find the percentage of girls in the class.
+
+ **Answer:**
+ ① 62.5%. ② 37.5%.
+
+ **Solution:**
+ Percentage of boys = (25/40) * 100 = 62.5%, percentage of girls = (15/40) * 100 = 37.5%.
+ ----------------------------------------------
+
+ ## Example Output:
+
+ ```json
+ [
+     {
+         "sub_id": 1,
+         "sub_question": "A class has 40 students, including 25 boys and 15 girls. ![image](question_images/a284h5iuh38.jpg) Find the percentage of boys in the class.",
+         "sub_answer": "62.5%.",
+         "sub_solution": "Percentage of boys = (25/40) * 100 = 62.5%."
+     },
+     {
+         "sub_id": 2,
+         "sub_question": "A class has 40 students, including 25 boys and 15 girls. ![image](question_images/a284h5iuh38.jpg) Find the percentage of girls in the class.",
+         "sub_answer": "37.5%.",
+         "sub_solution": "Percentage of girls = (15/40) * 100 = 37.5%."
+     }
+ ]
+ ```
+ Now, please split the following question according to the above requirements:
+ [Question]
+ {input_question}
+
+ [Answer]
+ {input_answer}
+
+ [Solution]
+ {input_solution}
+ """
+
+     def build_prompt(self, need_fields, **kwargs):
+         # Check for missing fields
+         missing = [f for f in need_fields if f not in kwargs]
+         if missing:
+             if self.on_missing == "raise":
+                 raise KeyError(f"Missing fields for prompt: {missing}")
+             # Lenient mode: fill missing fields with empty strings
+             for f in missing:
+                 kwargs[f] = ""
+         prompt = self.f_str_template
+         for key, value in kwargs.items():
+             prompt = prompt.replace(f"{{{key}}}", str(value))
+
+         return prompt
+
+ @PROMPT_REGISTRY.register()
+ class TypeClassifyPrompt(DIYPromptABC):
+     def __init__(self, f_str_template: str = "{input_text}", on_missing: str = "raise"):
+         self.on_missing = on_missing
+         self.f_str_template = '''
+ [Role]
+ You are an education expert familiar with textbook question formats at high school and university levels.
+ Your task is to determine the question type based on the question and answer provided.
+
+ [Possible Categories]
+ Choose exactly one of the following types:
+
+ 1. Proof problem - requires proving a statement, identity, inequality, or property.
+
+ 2. Explanation problem - asks for reasoning, causes, interpretation, principle, or conceptual explanation.
+
+ 3. Fill-in problem - asks to fill in blanks, complete missing expressions, or supply intermediate steps.
+
+ 4. Calculation problem - involves explicit numerical or symbolic computation, formula manipulation, or value derivation.
+ Even if the final answer is a short conclusion such as “thus xxx increases” or “so the velocity decreases,”
+ it should still be considered a Calculation problem if the majority of the reasoning is computational.
+
+ 5. Multiple-choice problem - asks to choose or identify the correct option (e.g., “Which of the following…”).
+
+ 6. Sketching/Plotting problem - requires sketching a figure, diagram, graph, or geometric representation.
+
+ 7. Other - for tasks that don't fit any of the above types.
+
+ [Judgment Rules]
+
+ 1. If the problem explicitly says “prove,” “show that,” “derive,” and does not have a short final answer → classify as Proof problem.
+
+ 2. If it mainly contains explanations, reasoning, or conceptual analysis without detailed calculation → Explanation problem.
+
+ 3. If the question has blanks, missing terms, or placeholders (e.g., “( )” or “____”), or the question seems **incomplete** → Fill-in problem.
+
+ 4. If there are multiple formula derivations, substitutions, or numeric results → Calculation problem,
+ even if followed by a brief explanatory conclusion.
+
+ 5. If it asks to select the correct answer among options (A/B/C/D, etc.) → Multiple-choice problem.
+
+ 6. If the question explicitly requires producing a figure, diagram, plot, or geometric construction → Sketching/Plotting problem.
+
+ 7. If none of these clearly apply or the problem type is mixed → Other.
+
+ [Output Format]
+ Return a JSON object with the following fields:
+ {
+     "type": "Calculation | Proof | Explanation | Fill-in | Multiple-choice | Sketching/Plotting | Other",
+     "reason": "Brief justification for the classification."
+ }
+
+ Please determine the type of the following question, using exactly one of the above category names
+ (Proof, Explanation, Fill-in, Calculation, Multiple-choice, Sketching/Plotting, Other) in the "type" field.
+
+ [Question]
+ {input_question}
+
+ [Answer]
+ {input_answer}
+ '''
+
+     def build_prompt(self, need_fields, **kwargs):
+         # Check for missing fields
+         missing = [f for f in need_fields if f not in kwargs]
+         if missing:
+             if self.on_missing == "raise":
+                 raise KeyError(f"Missing fields for prompt: {missing}")
+             # Lenient mode: fill missing fields with empty strings
+             for f in missing:
+                 kwargs[f] = ""
+         prompt = self.f_str_template
+         for key, value in kwargs.items():
+             prompt = prompt.replace(f"{{{key}}}", str(value))
+
+         return prompt
+
+ @PROMPT_REGISTRY.register()
+ class QAFilterPrompt(DIYPromptABC):
+     """
+     Prompt for filtering out unsuitable question-answer pairs.
+     """
+     def __init__(self, on_missing: str = "raise"):
+         self.on_missing = on_missing
+         self.f_str_template = """
+ [Role]
+ You are an education expert familiar with textbook question formats at high school and university levels.
+ Your task is to determine whether the provided question and answer pair is suitable to serve as a problem in an exam.
+
+ Question: {input_question}
+
+ Answer: {input_answer}
+
+ [Criteria]
+ 1. Clarity: The question must be suitable for an exam setting, meaning it should raise **a clear problem** that requires a specific solution.
+ For example, **statements without questions**, open-ended discussions, and other content that does not pose a clear problem are not suitable.
+ Questions like "Give an example of..." that can have many valid answers are also not suitable.
+ You should be particularly careful with questions that **only provide a topic or theme** without a specific problem to solve.
+ For instance, "all primes less than 100" is not a valid question, because it does not specify what to do (listing, counting, ...) with those primes.
+ Instead, a question like "List all primes less than 100" or "How many primes are there less than 100?" would be suitable.
+ 2. Relevance: The answer must directly address the question asked.
+ If the answer seems to be addressing a different question and is wrongly paired with the given question, it is not suitable.
+ 3. Completeness and Self-Containment: The question and answer should be complete and self-contained, providing all necessary information for understanding and solving it without requiring external context.
+ Questions that rely heavily on prior context or external references are not suitable.
+ Answers such as "Refer to theorem X", "Corollary of previous result", "Answered in the text above", "Omitted for brevity" are not acceptable.
+ Incomplete questions or answers that leave out critical information are also not suitable.
+ 4. Explicit Task Requirement: The question must contain an explicit task phrase (such as "compute", "determine", "find", "prove", "list", "show", "give the value of", etc.).
+ Pure expressions or noun phrases are NOT acceptable even if they are commonly understood as implicit tasks in mathematical contexts.
+ If the question does not include an explicit verb specifying what the student must do, it must be judged unsuitable.
+ Of course, if the question is in a multiple-choice or fill-in-the-blank format, the choices or blanks themselves will serve as the explicit task requirement.
+
+ [Important Notice]
+ 1. You do not need to evaluate the correctness of the answer, only whether it is appropriate and complete in relation to the question.
+ 2. A short answer with no explanation (calculation, proof, counterexample, ...) is acceptable as long as it directly addresses the question.
+ 3. There might be figures in the question or answer, represented as `![image](image_url)`. However, we do not give you those figures.
+ You can assume that if the question or answer contains such figure references, they are correctly placed and provide necessary information.
+ 4. Sometimes in a fill-in question, the blanks like "___" may be missing due to OCR errors. In this case, if the question is otherwise clear and complete, you can still judge it as suitable.
+ 5. You should be very strict in your evaluation. If any of the criteria above are not fully met, the question-answer pair should be considered unsuitable.
+
+ [Output Format]
+ Return a JSON object with the following fields:
+ {
+     "reason": "Brief justification of your judgement.",
+     "judgement": "true | false"
+ }
+
+ Your judgment:
+ """
+
+     def build_prompt(self, need_fields, **kwargs):
+         # Check for missing fields
+         missing = [f for f in need_fields if f not in kwargs]
+         if missing:
+             if self.on_missing == "raise":
+                 raise KeyError(f"Missing fields for prompt: {missing}")
+             # Lenient mode: fill missing fields with empty strings
+             for f in missing:
+                 kwargs[f] = ""
+         prompt = self.f_str_template
+         for key, value in kwargs.items():
+             prompt = prompt.replace(f"{{{key}}}", str(value))
+
+         return prompt
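All three prompt classes fill placeholders with plain `str.replace` over `{key}` markers rather than `str.format`. That choice matters: the templates contain literal JSON braces (e.g. `{"judgement": ...}`) that `str.format` would reject as malformed format fields. A standalone sketch of the substitution (the name `fill_template` is ours):

```python
def fill_template(template, **kwargs):
    # Plain replace instead of str.format: the templates contain literal JSON
    # braces like {"judgement": "true | false"} that str.format would choke on.
    out = template
    for key, value in kwargs.items():
        out = out.replace(f"{{{key}}}", str(value))
    return out

tpl = 'Question: {input_question}\nReturn JSON: {"judgement": "true | false"}'
print(fill_template(tpl, input_question="What is 2+2?"))
# Question: What is 2+2?
# Return JSON: {"judgement": "true | false"}
```

The trade-off is that unknown placeholders are silently left in place, which is why the classes also validate `need_fields` up front.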
prompts/pdf2vqa.py ADDED
@@ -0,0 +1,75 @@
+ from dataflow.utils.registry import PROMPT_REGISTRY
+ from dataflow.core.prompt import PromptABC
+
+ @PROMPT_REGISTRY.register()
+ class QAExtractPrompt(PromptABC):
+     def __init__(self):
+         pass
+
+     def build_prompt(self) -> str:
+         PROMPT = f"""
+ You are an expert at answering college-level questions. You are given a JSON file. Your task is to segment the content, insert image tags, and extract labels:
+ 1. Every JSON item has an "id" field. Your main task is to output this field.
+ 2. You need to segment the content into multiple `<qa_pair>`…`</qa_pair>` blocks, each containing a question and its corresponding answer with solution.
+ 3. If the problem or answer/solution is not complete, omit them. An answer/solution should be considered complete as long as either the answer or the solution exists.
+ 4. You need to put the image ids into their proper positions. You can look at the caption or context to decide where to put the image tags.
+ 5. You will also need to extract the chapter title and each problem's label/number from the text.
+ 6. You only need to output the "id" field for **chapter titles, questions and solutions**. DO NOT OUTPUT ORIGINAL TEXT. Use ',' to separate different ids.
+ 7. However, use original labels/numbers for labels, and use original numbers for answers. DO NOT output the "id" field for labels and answers. You will need to extract them from the text.
+ """
+         PROMPT += f"""
+ Strict extraction rules:
+ ** About questions and answers/solutions **
+ - Preserve each problem’s original label/number, such as "例1", "Example 3", "习题1", "11". Do not include the period after the number. Use Arabic numerals only. For example, if the label is "例一", convert it to "例1". If the label is "IV", convert it to "4".
+ - If the full label is "三、16", keep only "16". If the full label is "5.4", keep only "4".
+ - If there are multiple sub-questions (such as "(1)", "(a)") under one main question, always put them together in the same `<qa_pair>`…`</qa_pair>` block.
+ - If a question and its answer/solution are contiguous, wrap them together as a single `<qa_pair>`…`</qa_pair>` block, e.g.:
+ `<qa_pair><label>1</label><question>…</question><answer>…</answer><solution>…</solution></qa_pair>`
+ - If a question and its answer/solution are NOT contiguous (e.g. only question; only answer and/or solution; all questions at the front and all answers/solutions at the back), wrap each question or answer/solution in a `<qa_pair>`…`</qa_pair>` block with the missing part left empty. For example, if only questions appear:
+ `<qa_pair><label>1</label><question>…</question><answer></answer><solution></solution></qa_pair>`
+ - In total, there are 7 possibilities: only question, only answer, only solution, question with answer, question with solution, answer with solution, full question and answer and solution.
+ - If multiple qa pairs appear, wrap each qa pair in its own `<qa_pair>`…`</qa_pair>` block.
+ - If you do not see the full solution, only extract the short answer and leave the solution empty. YOU MUST KEEP SHORT ANSWERS !!!
+ ** About chapter/section titles **
+ - Always enclose qa pairs in a `<chapter>`…`</chapter>` block, where <title>MAIN_TITLE_ID</title> is the id of the chapter title or section title.
+ - Normally, chapter/section titles appear before the questions/answers in an independent JSON item.
+ - There could be multiple `<chapter>`…`</chapter>` blocks if multiple chapters/sections exist.
+ - **Any title followed by a question/answer whose label/number is not 1, or a title with a score such as "一、选择题(每题1分,共10分)", should NOT be extracted.**
+ - Do not use nested titles.
+ - Leave the title blank if there is no chapter title.
+ ** About figures/diagrams **
+ - Whenever the question or answer/solution refers to a figure or diagram, record its "id" in question/answer/solution just like other text content.
+ - You MUST include all images referenced in the question/answer/solution.
+
+
+ If no qualifying content is found, output:
+ <empty></empty>
+
+ Output format (all tags run together, no extra whitespace or newlines except between entries):
+ <chapter><title>MAIN_TITLE_ID</title>
+ <qa_pair><label>LABEL(EXTRACTED FROM TEXT)</label><question>QUESTION_IDS</question>
+ <answer>ANSWER(EXTRACTED FROM SOLUTION)</answer><solution>SOLUTION_IDS</solution></qa_pair>
+ <qa_pair><label>LABEL(EXTRACTED FROM TEXT)</label><question>QUESTION_IDS</question>
+ <answer>ANSWER(EXTRACTED FROM SOLUTION)</answer><solution></solution></qa_pair>
54
+ </chapter>
55
+ <chapter><title>MAIN_TITLE_ID</title>
56
+ <qa_pair><label>LABEL(EXTRACTED FROM TEXT)</label><question>QUESTION_IDS</question>
57
+ <answer>ANSWER(EXTRACTED FROM SOLUTION)</answer><solution>SOLUTION_IDS</solution></qa_pair>
58
+ </chapter>
59
+
60
+
61
+ Example:
62
+ <chapter><title>7</title>
63
+ <qa_pair><label>1</label><question>2,3</question>
64
+ <answer>Yes</answer><solution>5,6,7</solution></qa_pair>
65
+ <qa_pair><label>2</label><question>8,9,10</question>
66
+ <answer>3.14</answer><solution></solution></qa_pair>
67
+ </chapter>
68
+ <chapter><title>12</title>
69
+ <qa_pair><label>1</label><question></question>
70
+ <answer>2^6</answer><solution>16</solution></qa_pair>
71
+ </chapter>
72
+
73
+ Please now process the provided json and output your result.
74
+ """
75
+ return PROMPT
prompts/question_answer_clean.py ADDED
@@ -0,0 +1,67 @@
+ from dataflow.utils.registry import PROMPT_REGISTRY
+ from dataflow.core.prompt import PromptABC
+
+ @PROMPT_REGISTRY.register()
+ class TextCleaningPrompt(PromptABC):
+     """
+     Prompt templates for stripping non-content fragments from question/answer text.
+     """
+     def __init__(self):
+         self.question_prompt_template = """你是一名数据清洗专家。请从以下题目文本中识别所有与问题实质无关的非内容性信息片段,这些片段应被完全移除。
+
+ 【非内容性信息定义】(应删除):
+ - 题号、例号、习题编号(如 "1.1"、"例3"、"Problem 2.4"、"习题2-5");
+ - 章节标记(如 "§2.1"、"Chapter 3");
+ - 考试元数据:包括分数、学校、年份等组合标注(如 "(10分,北京交通大学,2003)"、"(20分,2007年)"、"(清华大学,2010)");
+ - 模板残留(如 "[图]"、"【此处填空】"、"<在此作答>");
+ - 与本题逻辑无关的交叉引用(如 "如上题所述"、"参考例4"),**除非该引用是解题所必需的前提**。
+
+ 【重要内容定义】(必须保留,禁止删除):
+ - **图片引用**:包括 Markdown 图片语法(如 `![图2-1](question_images/xxx.jpg)`)、纯路径(如 `question_images/xxx.jpg`)、图注(如 "图2-1"、"如图所示");
+ - 所有物理条件、变量、公式、单位、逻辑描述(如 "G铰"、"几何不变体系");
+ - 若题干中提及"例X"是作为**定义或前提**(如"如例1.2中定义的模型"),则保留;否则(如开头的"例1")应删除。
+
+ 【重要规则】:
+ 1. **不要重写、不要改写、不要总结**原始文本;
+ 2. **仅输出需要删除的子字符串**,多个片段用 `||` 分隔;
+ 3. 如果没有非内容性信息,输出 `NONE`;
+ 4. **必须原样输出片段**(包括空格、括号、标点、中文顿号等);
+ 5. **特别注意**:任何包含 `question_images/` 的路径、`![...](...)` 结构、或"图X-X"形式的图标识,**一律不得删除**;
+ 6. 考试元数据(如"(10分,北京交通大学,2003)")**必须整段删除**,包括括号。
+ 7. **务必保留必要的前提信息。**
+ 8. **删除后的问题,一定还能构成一个完整的问题**。输出前请三思。
+ 例如对于"In Exercises 3-6, calculate the size of the set.\n4. {{x|x is a prime number less than 10}}",
+ 这个必要的前缀不应该被删除,你要删除的内容应该是 " In Exercises 3-6, || 4."。
+ 如果你对这一点感到困惑、为难,请**务必保留更多内容,而不是删除**。
+ 9. 应**最小化**删除与题目无关的文本,不要过分删除。如果你有任何疑问,请优先选择保留,甚至直接输出 `NONE`。
+
+ 题目文本:{text}
+
+ 请输出待删除的片段(用 `||` 分隔)或 `NONE`:"""
+
+         self.answer_prompt_template = """你是一名数据清洗专家。请从以下答案文本中识别所有与答案实质无关的非内容性信息片段,这些片段应被完全移除。
+
+ 【非内容性信息定义】:
+ - 答案引导词(如 "答:"、"答案:"、"Solution:"、"解:");
+ - 习题引用(如 "(见习题2.3)"、"同例4"、"参考教材P30");
+ - 模板残留(如 "[计算过程略]"、"{{result}}");
+ - 与答案结论无关的附加说明(如 "详见附录");
+ - 其他非答案核心内容的元信息。
+
+ 【重要规则】:
+ 1. 不要重写、不要改写、不要总结原始文本;
+ 2. 仅输出需要删除的子字符串,多个片段用 `||` 分隔;
+ 3. 如果没有非内容性信息,输出 `NONE`;
+ 4. 必须原样输出片段(包括冒号、空格、括号等)。
+
+ 答案文本:{text}
+
+ 请输出待删除的片段(用 `||` 分隔)或 `NONE`:"""
+
+     def build_question_prompt(self, text):
+         """Build the question-cleaning prompt."""
+         return self.question_prompt_template.format(text=text)
+
+     def build_answer_prompt(self, text):
+         """Build the answer-cleaning prompt."""
+         return self.answer_prompt_template.format(text=text)
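These templates ask the model for fragments to delete (joined by `||`, or the literal `NONE`) rather than rewritten text, so a small post-processing step has to apply the deletions downstream. A minimal sketch of such a step; the `apply_removals` helper is hypothetical, not part of this repo, and it assumes whitespace around the `||` separator is incidental:

```python
def apply_removals(text: str, model_output: str) -> str:
    """Apply a cleaning model's output ("NONE" or "||"-separated fragments)."""
    if model_output.strip() == "NONE":
        return text
    for fragment in model_output.split("||"):
        fragment = fragment.strip()
        if fragment:
            # Remove only the first occurrence to avoid over-deleting
            text = text.replace(fragment, "", 1)
    return text.strip()

cleaned = apply_removals("例3 求半径为2的圆的面积。", "例3")
# cleaned == "求半径为2的圆的面积。"
```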
prompts/question_refine.py ADDED
@@ -0,0 +1,58 @@
+ from dataflow.utils.registry import PROMPT_REGISTRY
+ from dataflow.core.prompt import PromptABC
+
+ @PROMPT_REGISTRY.register()
+ class AddMissingBlankPrompt(PromptABC):
+     """
+     Adds missing blank markers to fill-in-the-blank questions.
+     """
+     def __init__(self):
+         self.f_str_template = """
+ [Role]
+ You are an education expert familiar with textbook question formats at high school and university levels.
+ You will be given a "fill-in-the-blank" question along with its answer.
+ However, the question may be missing some placeholders that indicate blanks.
+ You can use the provided answer to help determine where the blanks should be placed.
+ Ensure that the modified question clearly indicates all the blanks using placeholders.
+
+ Question: {input_question}
+
+ Answer: {input_answer}
+
+ [Important Notice]
+ 1. If the original question already has some placeholders (which could take different forms such as "()", "__", "____"), do not remove them. Instead, add any missing "___" based on the answer.
+ If the question is already complete with all necessary blanks (regardless of the form of the placeholders), return "ORIGINAL" (no quotes).
+ 2. Do not change any other part of the question except for adding the missing "___" !!!
+
+ [Examples]
+ Original Question: The capital of France is and the capital of Britain is ____.
+ Answer: Paris; London
+ Return: The capital of France is ___ and the capital of Britain is ____.
+
+ Original Question: The two legs of a right triangle are 3 and 4, then the third side is.
+ Answer: 5
+ Return: The two legs of a right triangle are 3 and 4, then the third side is ___.
+
+ Original Question: The area of a circle with radius r is ( ).
+ Answer: πr^2
+ Return: ORIGINAL
+
+ [Output Format]
+ Only output the full modified question with blanks represented by "___". Do not include any additional explanations or text.
+
+ """
+
+     def build_prompt(self, need_fields, **kwargs):
+         # Validate that all required fields were supplied
+         missing = [f for f in need_fields if f not in kwargs]
+         if missing:
+             # "on_missing" may be defined by the PromptABC base class; default to lenient
+             if getattr(self, "on_missing", "fill") == "raise":
+                 raise KeyError(f"Missing fields for prompt: {missing}")
+             # Lenient mode: fill missing fields with empty strings
+             for f in missing:
+                 kwargs[f] = ""
+         prompt = self.f_str_template
+         for key, value in kwargs.items():
+             prompt = prompt.replace(f"{{{key}}}", str(value))
+
+         return prompt
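`build_prompt` fills placeholders with plain `str.replace` rather than `str.format`, which can raise on stray braces in the template body (such as set-builder notation in an example question). A self-contained sketch of the same technique; the names here are illustrative, not from the repo:

```python
def fill_template(template: str, **kwargs) -> str:
    # Plain replacement: "{key}" -> value, tolerant of other braces in the text
    for key, value in kwargs.items():
        template = template.replace(f"{{{key}}}", str(value))
    return template

prompt = fill_template(
    "Question: {input_question}\nAnswer: {input_answer}",
    input_question="2 + 2 = ___",
    input_answer="4",
)
# prompt == "Question: 2 + 2 = ___\nAnswer: 4"
```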
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ # Gradio UI
+ gradio>=4.44.0
+
+ # DataFlow core (install from GitHub; includes pdf2vqa extras)
+ git+https://github.com/OpenDCAI/DataFlow.git#egg=dataflow[pdf2vqa]
+
+ # Runtime dependencies used by curate_data.py
+ json5
+ pandas
utils/format_utils.py ADDED
@@ -0,0 +1,136 @@
+ import json
+ import re
+
+ def refine_title(title: str, strict_title_match=False):
+     # TODO: more sophisticated title-cleaning logic may be needed here
+     # Strip all whitespace and line breaks from the title
+     title = re.sub(r'\s+', '', title)
+     if not strict_title_match:
+         # Prefer an Arabic chapter/section number (e.g. "1.1", "2")
+         match = re.search(r"\d+\.\d+|\d+", title)
+         if not match:
+             # Fall back to a Chinese-numeral chapter number (e.g. "六", "二十四")
+             match = re.search(r'[一二三四五六七八九零十百]+', title)
+         if match:
+             title = match.group()
+     return title
+
+ def merge_qa_pair(vqa_jsonl, output_jsonl, strict_title_match=False):
+     already_complete_count = 0
+     question_list = []
+     answer_list = []
+     with open(vqa_jsonl, 'r', encoding='utf-8') as vqa_file:
+         for line in vqa_file:
+             data = json.loads(line)
+             if data["question"] != "":
+                 question_list.append(data)
+             else:
+                 # Supports PDFs that put all questions first and all answers at the back
+                 answer_list.append(data)
+
+     with open(output_jsonl, 'w', encoding='utf-8') as out_file:
+         chapter_id = 0
+         chapter_title = ""
+         label = float('inf')
+         questions = {}
+         answers = {}
+         for data in question_list:
+             label_match = re.search(r'\d+', data["label"])
+             if label_match:
+                 data["label"] = label_match.group()
+             if data["chapter_title"] == "":
+                 data["chapter_title"] = chapter_title
+
+             try:
+                 data["label"] = int(data["label"])
+             except Exception:
+                 continue
+
+             if data["chapter_title"] != "" and data["chapter_title"] != chapter_title:
+                 if data["label"] < label:
+                     chapter_id += 1
+                     chapter_title = data["chapter_title"]
+                 else:
+                     # If the label keeps increasing while the chapter title changes, a
+                     # sub-heading was probably mis-extracted; keep the previous title.
+                     data["chapter_title"] = chapter_title
+             label = data["label"]
+             data["original_chapter_title"] = data["chapter_title"]
+             data["chapter_title"] = refine_title(data["chapter_title"], strict_title_match)
+             if data['label'] > 0:
+                 # Questions that already carry an answer/solution go straight to out_file
+                 if data["answer"] or data["solution"]:
+                     already_complete_count += 1
+                     qa_pair = {
+                         "question_chapter_title": data["original_chapter_title"],
+                         "answer_chapter_title": data["original_chapter_title"],
+                         "label": data['label'],
+                         "question": data["question"],
+                         "answer": data["answer"],
+                         "solution": data.get("solution", "")
+                     }
+                     out_file.write(json.dumps(qa_pair, ensure_ascii=False) + '\n')
+
+                 else:
+                     questions[(data["chapter_title"], data['label'])] = data
+
+         chapter_id = 0
+         chapter_title = ""
+         label = float('inf')
+         for data in answer_list:
+             label_match = re.search(r'\d+', data["label"])
+             if label_match:
+                 data["label"] = label_match.group()
+             if data["chapter_title"] == "":
+                 data["chapter_title"] = chapter_title
+
+             try:
+                 data["label"] = int(data["label"])
+             except Exception:
+                 continue
+
+             if data["chapter_title"] != "" and data["chapter_title"] != chapter_title:
+                 if data["label"] < label:
+                     chapter_id += 1
+                     chapter_title = data["chapter_title"]
+                 else:
+                     # If the label keeps increasing while the chapter title changes, a
+                     # sub-heading was probably mis-extracted; keep the previous title.
+                     data["chapter_title"] = chapter_title
+             label = data["label"]
+             # Keep the raw title as well; the final merge step reads it back
+             data["original_chapter_title"] = data["chapter_title"]
+             data["chapter_title"] = refine_title(data["chapter_title"], strict_title_match)
+             # Update incrementally so a duplicated label cannot overwrite an
+             # earlier entry's solution or answer
+             if data['label'] > 0:
+                 if not answers.get((data["chapter_title"], data['label'])):
+                     answers[(data["chapter_title"], data['label'])] = data
+                 else:
+                     if not answers[(data["chapter_title"], data['label'])].get("solution") and data.get("solution"):
+                         answers[(data["chapter_title"], data['label'])]["solution"] = data["solution"]
+                     if not answers[(data["chapter_title"], data['label'])].get("answer") and data.get("answer"):
+                         answers[(data["chapter_title"], data['label'])]["answer"] = data["answer"]
+
+         for label in questions:
+             if label in answers:
+                 qa_pair = {
+                     "question_chapter_title": questions[label]["original_chapter_title"],
+                     "answer_chapter_title": answers[label]["original_chapter_title"],
+                     "label": label[1],
+                     "question": questions[label]["question"],
+                     "answer": answers[label]["answer"],
+                     "solution": answers[label].get("solution", "")
+                 }
+                 out_file.write(json.dumps(qa_pair, ensure_ascii=False) + '\n')
+
+     print(f"Merged QA pairs: {len(questions.keys() & answers.keys()) + already_complete_count}")
+
+ def jsonl_to_md(jsonl_file, md_file):
+     with open(jsonl_file, 'r', encoding='utf-8') as in_file, open(md_file, 'w', encoding='utf-8') as out_file:
+         for line in in_file:
+             data = json.loads(line)
+             out_file.write(f"### Question {data['label']}\n\n")
+             out_file.write(f"{data['question']}\n\n")
+             out_file.write(f"**Answer:** {data['answer']}\n\n")
+             if data.get('solution'):
+                 out_file.write(f"**Solution:**\n\n{data['solution']}\n\n")
+             out_file.write("---\n\n")
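For reference, `refine_title`'s non-strict path reduces a noisy heading to its bare chapter number, which is what makes question and answer chapter keys comparable. An equivalent standalone sketch of that behavior (a mirror of `utils/format_utils.refine_title`, not the module itself):

```python
import re

def refine_title(title: str, strict_title_match: bool = False) -> str:
    # Strip whitespace, then prefer Arabic section numbers,
    # falling back to Chinese numerals.
    title = re.sub(r"\s+", "", title)
    if not strict_title_match:
        match = re.search(r"\d+\.\d+|\d+", title) or re.search(
            r"[一二三四五六七八九零十百]+", title
        )
        if match:
            title = match.group()
    return title

# refine_title("第 2.1 节 习题") == "2.1"
# refine_title("第六章 复习题") == "六"
```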