FeiElysia committed on
Commit 161b19e · 1 Parent(s): 5087c07

🚀 Initial deploy

Files changed (48)
  1. .gitattributes +2 -0
  2. app.py +384 -0
  3. examples/cover_videomme_FjS2LzrHEO8.png +3 -0
  4. examples/cover_videomme_fFjv93ACGo8.png +3 -0
  5. examples/demo.mp4 +3 -0
  6. examples/demo_cases.json +40 -0
  7. examples/description_honkai3_becauseofyou.png +3 -0
  8. examples/honkai3_becauseofyou.mp4 +3 -0
  9. examples/hsr_helloworld.mp4 +3 -0
  10. examples/lvbench_gXnhqF0TqqI.mp4 +3 -0
  11. examples/meme_hsr_helloworld.png +3 -0
  12. examples/ocr_honkai3_becauseofyou.png +3 -0
  13. examples/performance_hsr_helloworld.png +3 -0
  14. examples/tempo.png +3 -0
  15. examples/tempo.svg +0 -0
  16. examples/videomme_FjS2LzrHEO8.mp4 +3 -0
  17. examples/videomme_FsLaTZmP6Uw.mp4 +3 -0
  18. examples/videomme_Sp2nxlrQ89w.mp4 +3 -0
  19. examples/videomme_fFjv93ACGo8.mp4 +3 -0
  20. packages.txt +1 -0
  21. requirements.txt +11 -0
  22. tempo/__init__.py +6 -0
  23. tempo/__pycache__/__init__.cpython-312.pyc +0 -0
  24. tempo/__pycache__/builder.cpython-312.pyc +0 -0
  25. tempo/__pycache__/constants.cpython-312.pyc +0 -0
  26. tempo/__pycache__/conversation.cpython-312.pyc +0 -0
  27. tempo/__pycache__/mm_datautils.cpython-312.pyc +0 -0
  28. tempo/__pycache__/mm_utils.cpython-312.pyc +0 -0
  29. tempo/__pycache__/tempo_arch.cpython-312.pyc +0 -0
  30. tempo/__pycache__/vlm_multimodal_processor.cpython-312.pyc +0 -0
  31. tempo/builder.py +62 -0
  32. tempo/constants.py +13 -0
  33. tempo/conversation.py +545 -0
  34. tempo/language_model/__pycache__/modeling_tempo_qwen.cpython-312.pyc +0 -0
  35. tempo/language_model/modeling_tempo_qwen.py +231 -0
  36. tempo/mm_datautils.py +1607 -0
  37. tempo/multimodal_encoder/__pycache__/base_encoder.cpython-312.pyc +0 -0
  38. tempo/multimodal_encoder/__pycache__/builder.cpython-312.pyc +0 -0
  39. tempo/multimodal_encoder/__pycache__/qwen3vl_encoder.cpython-312.pyc +0 -0
  40. tempo/multimodal_encoder/__pycache__/siglip_encoder.cpython-312.pyc +0 -0
  41. tempo/multimodal_encoder/base_encoder.py +135 -0
  42. tempo/multimodal_encoder/builder.py +21 -0
  43. tempo/multimodal_encoder/qwen3vl_encoder.py +336 -0
  44. tempo/multimodal_encoder/siglip_encoder.py +75 -0
  45. tempo/multimodal_projector/__pycache__/builder.cpython-312.pyc +0 -0
  46. tempo/multimodal_projector/builder.py +51 -0
  47. tempo/tempo_arch.py +464 -0
  48. tempo/vlm_multimodal_processor.py +332 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,384 @@
1
+ import io
2
+ import os
3
+ import time
4
+ import torch
5
+ import numpy as np
6
+ import gradio as gr
7
+ import multiprocessing
8
+ from decord import cpu, VideoReader
9
+
10
+ import matplotlib.pyplot as plt
11
+ import matplotlib.ticker as ticker
12
+ import matplotlib.colors as mcolors
13
+ from scipy.interpolate import make_interp_spline
14
+ from PIL import Image
15
+
16
+ from tempo.builder import load_pretrained_model
17
+ from tempo.conversation import conv_templates, SeparatorStyle
18
+ from tempo.constants import (
19
+ DEFAULT_IM_END_TOKEN,
20
+ DEFAULT_IM_START_TOKEN,
21
+ DEFAULT_IMAGE_TOKEN,
22
+ IMAGE_TOKEN_INDEX,
23
+ )
24
+ from tempo.mm_datautils import (
25
+ compute_segment_timestamp,
26
+ KeywordsStoppingCriteria,
27
+ process_qwen_content,
28
+ tokenizer_image_token,
29
+ )
30
+
31
+ import spaces
32
+ from huggingface_hub import snapshot_download
33
+
34
+ def get_real_cpu_cores():
35
+ """use multiple threads for video decoding"""
36
+ try:
37
+ # HF Spaces
38
+ cores = len(os.sched_getaffinity(0))
39
+ except AttributeError:
40
+ # Local environments
41
+ cores = multiprocessing.cpu_count()
42
+ return cores
43
+
44
+ def compute_sample_indices(
45
+ total_frames: int,
46
+ original_fps: float,
47
+ video_fps: float = 2.0,
48
+ min_frames_num: int = 4,
49
+ max_frames_num: int = 1024
50
+ ) -> list[int]:
51
+
52
+ start_frame, end_frame = 0, total_frames - 1
53
+ clip_frames = end_frame - start_frame + 1
54
+ if clip_frames <= 1:
55
+ return [start_frame]
56
+
57
+ if original_fps is None or original_fps <= 0:
58
+ original_fps = video_fps
59
+
60
+ clip_duration = clip_frames / original_fps
61
+ target_num_frames = max(1, round(clip_duration * video_fps))
62
+ final_num_frames = min(max(target_num_frames, min_frames_num), max_frames_num)
63
+
64
+ if final_num_frames == 1:
65
+ return [end_frame]
66
+
67
+ indices = np.round(np.linspace(start_frame, end_frame, final_num_frames)).astype(int)
68
+ indices = np.clip(indices, start_frame, end_frame)
69
+
70
+ return indices.tolist()
71
+
72
+ def load_video(video_path: str, video_fps: float = 2.0, max_frames: int = 1024) -> tuple:
73
+
74
+ available_cores = get_real_cpu_cores()
75
+ optimal_threads = min(max(1, available_cores - 1), 16)
76
+ print(f"[Profiling] Detected {available_cores} CPU cores. Decord using {optimal_threads} threads.")
77
+
78
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=optimal_threads)
79
+ total_frames = len(vr)
80
+ original_fps = vr.get_avg_fps()
81
+ frame_idx = compute_sample_indices(total_frames, original_fps, video_fps, max_frames_num=max_frames)
82
+ images = vr.get_batch(frame_idx).asnumpy()
83
+ clip_duration = total_frames / original_fps
84
+
85
+ real_fps = len(images) / clip_duration if clip_duration > 0 else video_fps
86
+
87
+ return images, real_fps
88
+
89
+ def generate_allocation_plot(allocations):
90
+ """
91
+ Token allocation visualization function
92
+ """
93
+ if allocations is None or len(allocations) == 0:
94
+ # if disable_dynamic_compress is True, we return a blank image
95
+ return Image.new('RGB', (1600, 350), color='white')
96
+
97
+ allocations = np.array(allocations)
98
+ num_segments = len(allocations)
99
+
100
+ plt.rcParams.update({'font.size': 14, 'font.family': 'serif'})
101
+ fig = plt.figure(figsize=(16, 3.5), layout='constrained')
102
+ gs = fig.add_gridspec(2, 1, height_ratios=[0.15, 1.0], hspace=0.05)
103
+
104
+
105
+ ax_heat = fig.add_subplot(gs[0])
106
+ ax_heat.set_title(" ", pad=50)
107
+
108
+ colors = ["#EBF5FB", "#85C1E9", "#F2D7D5", "#E74C3C", "#641E16"]
109
+ cmap_custom = mcolors.LinearSegmentedColormap.from_list("custom_heat", colors)
110
+
111
+ vmax_val = max(128, allocations.max())
112
+ ax_heat.imshow([allocations], cmap=cmap_custom, aspect='auto', extent=[0.5, num_segments + 0.5, 0, 1], vmin=4, vmax=vmax_val)
113
+ ax_heat.set_yticks([])
114
+ ax_heat.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
115
+ for spine in ax_heat.spines.values():
116
+ spine.set_linewidth(1.2)
117
+
118
+ ax_line = fig.add_subplot(gs[1], sharex=ax_heat)
119
+
120
+ x = np.arange(1, num_segments + 1)
121
+
122
+ if num_segments > 3:
123
+ spl = make_interp_spline(x, allocations, k=3)
124
+ x_smooth = np.linspace(1, num_segments, 800)
125
+ y_smooth = spl(x_smooth)
126
+ y_smooth = np.clip(y_smooth, 4, vmax_val)
127
+ else:
128
+ x_smooth = x
129
+ y_smooth = allocations
130
+
131
+ line_color = '#1A252C'
132
+ fill_color = '#D5D8DC'
133
+
134
+ ax_line.plot(x_smooth, y_smooth, color=line_color, linewidth=2.0)
135
+ ax_line.fill_between(x_smooth, y_smooth, color=fill_color, alpha=0.4)
136
+
137
+ ax_line.axhline(vmax_val, color='#C0392B', linestyle='--', linewidth=1.2, alpha=0.8)
138
+ ax_line.axhline(4, color='#2980B9', linestyle='--', linewidth=1.2, alpha=0.8)
139
+
140
+ ax_line.set_xlim(0.5, num_segments + 0.5)
141
+ ax_line.set_ylim(0, vmax_val + 12)
142
+ ax_line.set_ylabel("Tokens / Seg", fontsize=14, fontweight='bold')
143
+ ax_line.set_xlabel("Temporal Segments", fontsize=14, fontweight='bold')
144
+ ax_line.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
145
+
146
+ ax_line.spines['top'].set_visible(False)
147
+ ax_line.spines['right'].set_visible(False)
148
+ ax_line.spines['bottom'].set_linewidth(1.2)
149
+ ax_line.spines['left'].set_linewidth(1.2)
150
+ ax_line.grid(axis='y', linestyle=':', color='gray', alpha=0.5)
151
+
152
+ buf = io.BytesIO()
153
+ plt.savefig(buf, format='png', bbox_inches='tight', dpi=100, transparent=True)
154
+ plt.close(fig)
155
+ buf.seek(0)
156
+ return Image.open(buf)
157
+
158
+ model_id = "Vision-CAIR/Tempo-6B"
159
+ print(f"[Init] Downloading/Loading weights from {model_id}...")
160
+ MODEL_PATH = snapshot_download(repo_id=model_id)
161
+ print(f"[Init] Loading Tempo model from {MODEL_PATH}...")
162
+ tokenizer, model, image_processor = load_pretrained_model(
163
+ MODEL_PATH,
164
+ device_map="cuda",
165
+ use_flash_attn=True
166
+ )
167
+
168
+ FIXED_MAX_LENGTH = 16384
169
+ model.config.tokenizer_model_max_length = FIXED_MAX_LENGTH
170
+ tokenizer.model_max_length = FIXED_MAX_LENGTH
171
+ model.eval()
172
+ model.to(torch.bfloat16)
173
+ print(f"[Init] Model loaded! Max context length set to {FIXED_MAX_LENGTH}.")
174
+
175
+
176
+ # ==========================================
177
+ # inference
178
+ # ==========================================
179
+ @spaces.GPU
180
+ def predict(video_path, query, max_frames, visual_token_budget, temperature, max_new_tokens, disable_dynamic_compress):
181
+ if not video_path:
182
+ return "⚠️ Error: Please upload a video first."
183
+ if not query:
184
+ return "⚠️ Error: Please enter a question."
185
+
186
+ print(f"\n[Request] Video: {video_path} | Query: {query}")
187
+
188
+ model.config.visual_token_budget = int(visual_token_budget)
189
+ model.get_vision_tower_aux_list()[0].dynamic_compress = not disable_dynamic_compress
190
+
191
+ # video process
192
+ start_prep_time = time.perf_counter()
193
+ try:
194
+ video_frames, real_fps = load_video(video_path, video_fps=2.0, max_frames=int(max_frames))
195
+ except Exception as e:
196
+ return f"⚠️ Error loading video: {str(e)}"
197
+
198
+ # process local compressor inputs
199
+ frame_windows, frame_stride = 8, 8
200
+ vlm_inputs = process_qwen_content(
201
+ video_frames, "video", query, image_processor[0], real_fps, frame_windows, frame_stride, is_eval=True
202
+ )
203
+ vlm_inputs = {key: v.cuda() for key, v in vlm_inputs.items()}
204
+
205
+ # compute timestamp for each segment
206
+ seg_timestamps = compute_segment_timestamp(
207
+ len(vlm_inputs["video_grid_thw"]), tokenizer, real_fps, frame_stride, frame_windows
208
+ )
209
+
210
+ # stat info
211
+ num_segments = len(vlm_inputs["video_grid_thw"])
212
+ segment_duration = frame_windows / real_fps
213
+ stats_info = f"🎬 Video Stats: Total Segments: {num_segments} | Segment Duration: {segment_duration:.2f}s | Real FPS: {real_fps:.2f}"
214
+
215
+ # prompt
216
+ if getattr(model.config, "mm_use_im_start_end", False):
217
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + query
218
+ else:
219
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + query
220
+
221
+ conv_version = "qwen"
222
+ conv = conv_templates[conv_version].copy()
223
+ conv.append_message(conv.roles[0], qs)
224
+ conv.append_message(conv.roles[1], None)
225
+ prompt = conv.get_prompt()
226
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
227
+
228
+ # tokenization
229
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
230
+ stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
231
+
232
+ model._demo_count_allocations = []
233
+
234
+ start_infer_time = time.perf_counter()
235
+
236
+ # generating
237
+ with torch.inference_mode():
238
+ output_ids = model.generate(
239
+ input_ids,
240
+ images=None,  # this Qwen-VL-style path passes visual features via vlm_inputs rather than raw image tensors
241
+ image_sizes=None,
242
+ do_sample=(temperature > 0),
243
+ temperature=temperature if temperature > 0 else None,
244
+ max_new_tokens=int(max_new_tokens),
245
+ use_cache=True,
246
+ stopping_criteria=[stopping_criteria],
247
+ vlm_inputs=vlm_inputs,
248
+ seg_timestamps=seg_timestamps,
249
+ )
250
+
251
+ end_infer_time = time.perf_counter()
252
+
253
+ if isinstance(output_ids, tuple):
254
+ output_ids = output_ids[0]
255
+
256
+ prep_duration = start_infer_time - start_prep_time
257
+ infer_duration = end_infer_time - start_infer_time
258
+ total_duration = end_infer_time - start_prep_time
259
+ stats_info += f"\n⚑ Profiling : Prep Time: {prep_duration:.2f}s | Inference Time: {infer_duration:.2f}s | Total: {total_duration:.2f}s"
260
+
261
+ pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
262
+ if pred.endswith(stop_str):
263
+ pred = pred[: -len(stop_str)].strip()
264
+
265
+ # token allocation plot
266
+ allocations_data = model._demo_count_allocations
267
+ plot_img = generate_allocation_plot(allocations_data)
268
+
269
+ return pred, plot_img, stats_info
270
+
271
+ # ==========================================
272
+ # UI
273
+ # ==========================================
274
+ with gr.Blocks(title="Tempo Video Understanding", theme=gr.themes.Soft()) as demo:
275
+ gr.Markdown(
276
+ """
277
+ # ⏱️ Tempo: Small Vision-Language Models are Smart Compressors for Long Video Understanding
278
+ Upload a video and ask any question! Tempo dynamically compresses visual tokens based on your query to achieve SOTA performance.
279
+ **[🏠 Project Page](https://feielysia.github.io/)** | **[💻 GitHub](https://github.com/FeiElysia)** | **[📄 Paper](https://arxiv.org/abs/xxxx)** | **[👨‍💻 @Junjie Fei](https://feielysia.github.io/)**
280
+
281
+ *⏳ **Slow preprocessing?** Try Examples 4 & 5 below, decrease `Max Sampled Frames` in Advanced Settings, or check our [GitHub](https://github.com/FeiElysia) for full-speed local deployment.*
282
+ """
283
+ )
284
+
285
+ with gr.Row():
286
+ # left column: inputs
287
+ with gr.Column(scale=1):
288
+ video_input = gr.Video(label="Upload Video")
289
+ example_poster = gr.Image(label="Video Poster", interactive=False, height=150, visible=False)
290
+ query_input = gr.Textbox(label="Your Question", placeholder="e.g., What is the person doing in the video?", lines=3)
291
+ with gr.Row():
292
+ clear_btn = gr.Button("🧹 Clear", variant="secondary")
293
+ submit_btn = gr.Button("🚀 Generate Response", variant="primary")
294
+
295
+ # hyperparameters
296
+ with gr.Accordion("Advanced Settings", open=False):
297
+ max_frames_slider = gr.Slider(minimum=16, maximum=2048, value=1024, step=16, label="Max Sampled Frames")
298
+ budget_slider = gr.Slider(minimum=64, maximum=16384, value=8192, step=64, label="Visual Token Budget")
299
+ temp_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Temperature (0 = Greedy)")
300
+ max_tokens_slider = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max New Tokens")
301
+ disable_compress_chk = gr.Checkbox(label="Disable Dynamic Compression (Baseline)", value=False)
302
+
303
+ # right column: outputs
304
+ with gr.Column(scale=1):
305
+ output_text = gr.Textbox(label="Tempo Response", lines=12, interactive=False)
306
+ stats_text = gr.Textbox(label="📊 Video Segment Stats", lines=1, interactive=False)
307
+ output_plot = gr.Image(label="Query-Aware Visual Feature Intensity (Visual Token Allocation)", interactive=False, height=180)
308
+
309
+ # clicking submit_btn or pressing enter in query_input will trigger prediction
310
+ submit_btn.click(
311
+ fn=predict,
312
+ inputs=[video_input, query_input, max_frames_slider, budget_slider, temp_slider, max_tokens_slider, disable_compress_chk],
313
+ outputs=[output_text, output_plot, stats_text]
314
+ )
315
+
316
+ query_input.submit(
317
+ fn=predict,
318
+ inputs=[video_input, query_input, max_frames_slider, budget_slider, temp_slider, max_tokens_slider, disable_compress_chk],
319
+ outputs=[output_text, output_plot, stats_text]
320
+ )
321
+
322
+ clear_btn.click(
323
+ fn=lambda: (None, None, None, None, None, None),
324
+ inputs=None,
325
+ outputs=[video_input, example_poster, query_input, output_text, stats_text, output_plot]
326
+ )
327
+
328
+ # Examples
329
+ gr.Markdown("---")
330
+ gr.Markdown("### πŸ’‘ Try an Example")
331
+ gr.Examples(
332
+ examples=[
333
+ [
334
+ "examples/hsr_helloworld.mp4",
335
+ "Task: Please examine the provided media and answer the following three questions regarding the specific puppy in the scene:\n"
336
+ "Q1: What is the primary fur color of the puppy positioned on the swing?\n"
337
+ "Q2: Specify the exact time interval (in seconds, e.g., XX-XXs) during which the puppy is seen sitting on the swing.\n"
338
+ "Q3: Provide a brief description of the puppy's appearance and its surroundings.",
339
+ "examples/meme_hsr_helloworld.png"
340
+ ],
341
+ [
342
+ "examples/hsr_helloworld.mp4",
343
+ "Task: Please analyze the provided video and answer the following 7 questions precisely.\n"
344
+ "Q1: How many performers are visible on the stage?\n"
345
+ "Q2: Describe the architectural elements in the background. What historical civilization do they remind you of?\n"
346
+ "Q3: What is happening in the night sky above the performers, and what does this suggest about the event?\n"
347
+ "Q4: List the hair colors of the performers in order from left to right.\n"
348
+ "Q5: Identify the specific musical instrument being played by the performer located on the far left of the stage.\n"
349
+ "Q6: What is the specific time interval (in seconds, e.g., XX-XXs) during which this fireworks performance scene occurs in the video?\n"
350
+ "Q7: Look at the audience in the foreground. How does their silhouette-like depiction affect the viewer's perspective of the stage?",
351
+ "examples/performance_hsr_helloworld.png"
352
+ ],
353
+ [
354
+ "examples/honkai3_becauseofyou.mp4",
355
+ "What text appears in the center of the video behind a sea of pink flowers?",
356
+ "examples/ocr_honkai3_becauseofyou.png"
357
+ ],
358
+ [
359
+ "examples/videomme_fFjv93ACGo8.mp4",
360
+ "How many red socks are above the fireplace at the end of this video?",
361
+ "examples/cover_videomme_fFjv93ACGo8.png"
362
+ ],
363
+ [
364
+ "examples/videomme_FjS2LzrHEO8.mp4",
365
+ "What was the purpose of using a hammer to hit the car in the video?\n"
366
+ "A. To show the hammer works well.\n"
367
+ "B. To show the solidity of the car.\n"
368
+ "C. To warn people not to hit cars with hammers.\n"
369
+ "D. To illustrate that a hammer is harder than a bullet.",
370
+ "examples/cover_videomme_FjS2LzrHEO8.png"
371
+ ],
372
+ [
373
+ "examples/honkai3_becauseofyou.mp4",
374
+ "Describe the video in detail.",
375
+ "examples/description_honkai3_becauseofyou.png"
376
+ ]
377
+ ],
378
+ inputs=[video_input, query_input, example_poster],
379
+ cache_examples=False,
380
+ )
381
+
382
+ if __name__ == "__main__":
383
+ demo.queue().launch(share=True)
384
+
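For orientation, below is a minimal non-Gradio sketch of the same inference path that app.py wires up above. It is a hypothetical local smoke test, not part of this commit: it assumes the repo root is importable, a CUDA GPU with the pinned flash-attn build is available, and examples/demo.mp4 has been fetched via Git LFS; the small load_video helper is a simplified stand-in for the decord-based sampler defined in app.py.

```python
# Hypothetical local smoke test (not part of this commit): runs one query
# through the same Tempo calls that app.py uses, without the Gradio UI.
import numpy as np
import torch
from decord import VideoReader, cpu
from huggingface_hub import snapshot_download

from tempo.builder import load_pretrained_model
from tempo.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from tempo.conversation import conv_templates
from tempo.mm_datautils import (
    compute_segment_timestamp,
    process_qwen_content,
    tokenizer_image_token,
)

def load_video(path, video_fps=2.0, max_frames=256):
    # Simplified stand-in for app.load_video: uniform sampling at ~2 fps.
    vr = VideoReader(path, ctx=cpu(0))
    total, fps = len(vr), vr.get_avg_fps()
    duration = total / fps
    num = int(np.clip(round(duration * video_fps), 4, max_frames))
    idx = np.round(np.linspace(0, total - 1, num)).astype(int)
    frames = vr.get_batch(idx).asnumpy()
    return frames, len(frames) / duration

model_path = snapshot_download("Vision-CAIR/Tempo-6B")
tokenizer, model, image_processor = load_pretrained_model(
    model_path, device_map="cuda", use_flash_attn=True
)
model.eval().to(torch.bfloat16)

frames, real_fps = load_video("examples/demo.mp4")
query = "Describe the video in detail."
frame_windows, frame_stride = 8, 8
vlm_inputs = process_qwen_content(
    frames, "video", query, image_processor[0], real_fps, frame_windows, frame_stride, is_eval=True
)
vlm_inputs = {k: v.cuda() for k, v in vlm_inputs.items()}
seg_timestamps = compute_segment_timestamp(
    len(vlm_inputs["video_grid_thw"]), tokenizer, real_fps, frame_stride, frame_windows
)

conv = conv_templates["qwen"].copy()
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query)
conv.append_message(conv.roles[1], None)
input_ids = tokenizer_image_token(
    conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).cuda()

with torch.inference_mode():
    output_ids = model.generate(
        input_ids, vlm_inputs=vlm_inputs, seg_timestamps=seg_timestamps,
        max_new_tokens=256, use_cache=True,
    )
if isinstance(output_ids, tuple):
    output_ids = output_ids[0]
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())
```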
examples/cover_videomme_FjS2LzrHEO8.png ADDED

Git LFS Details

  • SHA256: 4dd8043baab3af724d2115de6a90d8015b356a545a617490091b7f98592662f4
  • Pointer size: 132 Bytes
  • Size of remote file: 4.01 MB
examples/cover_videomme_fFjv93ACGo8.png ADDED

Git LFS Details

  • SHA256: 43d8ae2c4416c09fdda0b305ad3a0264d5c61308d6740067818078e8a64ef79d
  • Pointer size: 133 Bytes
  • Size of remote file: 17.4 MB
examples/demo.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec316c8e5fe7f2a62137060c0a35123de75802522816c74478c69dbc329d45da
3
+ size 85447789
examples/demo_cases.json ADDED
@@ -0,0 +1,40 @@
1
+ [
2
+ {
3
+ "video_path": "./examples/hsr_helloworld.mp4",
4
+ "query": "Task: Please examine the provided media and answer the following three questions regarding the specific puppy in the scene:\nQ1: What is the primary fur color of the puppy positioned on the swing?\nQ2: Specify the exact time interval (in seconds, e.g., XX-XXs) during which the puppy is seen sitting on the swing.\nQ3: Provide a brief description of the puppy's appearance and its surroundings."
5
+ },
6
+ {
7
+ "video_path": "./examples/hsr_helloworld.mp4",
8
+ "query": "Task: Please analyze the provided video and answer the following 7 questions precisely.\nQ1: How many performers are visible on the stage?\nQ2: Describe the architectural elements in the background. What historical civilization do they remind you of?\nQ3: What is happening in the night sky above the performers, and what does this suggest about the event?\nQ4: List the hair colors of the performers in order from left to right.\nQ5: Identify the specific musical instrument being played by the performer located on the far left of the stage.\nQ6: What is the specific time interval (in seconds, e.g., XX-XXs) during which this fireworks performance scene occurs in the video?\nQ7: Look at the audience in the foreground. How does their silhouette-like depiction affect the viewer's perspective of the stage?"
9
+ },
10
+ {
11
+ "video_path": "./examples/honkai3_becauseofyou.mp4",
12
+ "query": "What text appears in the center of the video behind a sea of pink flowers?"
13
+ },
14
+ {
15
+ "video_path": "./examples/honkai3_becauseofyou.mp4",
16
+ "query": "Describe the video in detail."
17
+ },
18
+ {
19
+ "video_path": "./examples/videomme_fFjv93ACGo8.mp4",
20
+ "query": "What colors are the clothes worn by the two announcers in the studio?"
21
+ },
22
+ {
23
+ "video_path": "./examples/videomme_FjS2LzrHEO8.mp4",
24
+ "query": "What was the purpose of using a hammer to hit the car in the video?\nA. To show the hammer works well.\nB. To show the solidity of the car.\nC. To warn people not to hit cars with hammers.\nD. To illustrate that a hammer is harder than a bullet."
25
+ },
26
+ {
27
+ "video_path": "./examples/videomme_FsLaTZmP6Uw.mp4",
28
+ "query": "Which year was the game held?"
29
+ },
30
+ {
31
+ "video_path": "./examples/videomme_Sp2nxlrQ89w.mp4",
32
+ "query": "In line with the video evidence, why does the orange stickman want to destroy the Minecraft world?\nA. He wants to save his son.\nB. He is too sad.\nC. He loses his son.\nD. He does like the world."
33
+ },
34
+ {
35
+ "video_path": "./examples/lvbench_gXnhqF0TqqI.mp4",
36
+ "query": "Where are the woman and children when they first appear in the video?"
37
+ }
38
+
39
+
40
+ ]
examples/description_honkai3_becauseofyou.png ADDED

Git LFS Details

  • SHA256: 29dff9fea96a3f57f539da37c40aba481adee56082bd2ab0cc695f319ade3a76
  • Pointer size: 132 Bytes
  • Size of remote file: 5.9 MB
examples/honkai3_becauseofyou.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12f943f66683a6a5a49888d4651dd868d75ce4be4fc35ea5af0853b5921de62f
3
+ size 43919755
examples/hsr_helloworld.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd2fb2c3a84719bb586925e68acb70de282020268a42bfef53635b88c8e6afcd
3
+ size 17860778
examples/lvbench_gXnhqF0TqqI.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7135161f9b45bf968ec320cc27b3c381593f47dbd8f17013a11a59da50980315
3
+ size 276007868
examples/meme_hsr_helloworld.png ADDED

Git LFS Details

  • SHA256: 4adcaa6d15600f1cb128e84c3524fff4ab333d5239f4a38c2a576663e2472a82
  • Pointer size: 132 Bytes
  • Size of remote file: 8.77 MB
examples/ocr_honkai3_becauseofyou.png ADDED

Git LFS Details

  • SHA256: c7c6b756d1ec0b6cf2797a0b229c99c19feb436b0cf0e27378f84baf7853cdca
  • Pointer size: 131 Bytes
  • Size of remote file: 456 kB
examples/performance_hsr_helloworld.png ADDED

Git LFS Details

  • SHA256: 42cbe2fd725ba17dc380748680b706cb0d070716c165953e89848e0d540d999f
  • Pointer size: 133 Bytes
  • Size of remote file: 12.4 MB
examples/tempo.png ADDED

Git LFS Details

  • SHA256: da600c7bca0b107f1d7696cb2b8876d6ecfdda8742bf966751e99035c1cfa0fa
  • Pointer size: 131 Bytes
  • Size of remote file: 863 kB
examples/tempo.svg ADDED
examples/videomme_FjS2LzrHEO8.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43b9cb496d7e3c56b9de35e777df206f84c99a907bdbd569a788ba098ffd46e8
3
+ size 6328020
examples/videomme_FsLaTZmP6Uw.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c68ee0eab9c6ef905de336222e127e7b393e0a5d862c2cdc5997f50b3b130d48
3
+ size 8838644
examples/videomme_Sp2nxlrQ89w.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67436aa59d44434e2b115bc5d6e07ea7e06a6de1bdfaea7c91ca320fbdd787b3
3
+ size 136659511
examples/videomme_fFjv93ACGo8.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba82f02cf6acc6a25c6efc4783fc2252c65afbedb8dbc4c7148af08834fdf999
3
+ size 17126035
packages.txt ADDED
@@ -0,0 +1 @@
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ spaces
2
+ qwen-vl-utils==0.0.14
3
+ transformers==4.57.1
4
+ accelerate==1.13.0
5
+ gradio
6
+ matplotlib
7
+ scipy
8
+ huggingface_hub
9
+ decord
10
+ av
11
+ flash-attn==2.7.4.post1
tempo/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+
2
+ from tempo.language_model.modeling_tempo_qwen import TempoConfig, TempoQwenForCausalLM
3
+ __all__ = [
4
+ "TempoConfig",
5
+ "TempoQwenForCausalLM",
6
+ ]
tempo/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (305 Bytes).
 
tempo/__pycache__/builder.cpython-312.pyc ADDED
Binary file (2.14 kB).
 
tempo/__pycache__/constants.cpython-312.pyc ADDED
Binary file (556 Bytes).
 
tempo/__pycache__/conversation.cpython-312.pyc ADDED
Binary file (20.3 kB).
 
tempo/__pycache__/mm_datautils.cpython-312.pyc ADDED
Binary file (57.1 kB).
 
tempo/__pycache__/mm_utils.cpython-312.pyc ADDED
Binary file (3.07 kB).
 
tempo/__pycache__/tempo_arch.cpython-312.pyc ADDED
Binary file (17.5 kB).
 
tempo/__pycache__/vlm_multimodal_processor.cpython-312.pyc ADDED
Binary file (14.1 kB).
 
tempo/builder.py ADDED
@@ -0,0 +1,62 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ from tempo.constants import (
17
+ DEFAULT_IM_END_TOKEN,
18
+ DEFAULT_IM_START_TOKEN,
19
+ DEFAULT_IMAGE_PATCH_TOKEN,
20
+ )
21
+ from transformers import AutoTokenizer
22
+
23
+ from tempo.language_model.modeling_tempo_qwen import TempoQwenForCausalLM
24
+
25
+ def load_pretrained_model(
26
+ model_path,
27
+ device_map="auto",
28
+ device="cuda",
29
+ use_flash_attn=False,
30
+ **kwargs,
31
+ ):
32
+ kwargs = {"device_map": device_map, **kwargs}
33
+
34
+ if device != "cuda":
35
+ kwargs["device_map"] = {"": device}
36
+
37
+ kwargs["dtype"] = torch.float16
38
+ if use_flash_attn:
39
+ kwargs["attn_implementation"] = "flash_attention_2"
40
+
41
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
42
+ model = TempoQwenForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
43
+ add_tokens_flag = False
44
+ if getattr(model.config, "mm_use_im_patch_token", False):
45
+ add_tokens_flag = True
46
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
47
+ if getattr(model.config, "mm_use_im_start_end", False):
48
+ add_tokens_flag = True
49
+ tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
50
+ if add_tokens_flag:
51
+ model.resize_token_embeddings(len(tokenizer))
52
+
53
+ vision_tower_aux_list = model.get_vision_tower_aux_list()
54
+ for vision_tower_aux in vision_tower_aux_list:
55
+ if not vision_tower_aux.is_loaded:
56
+ vision_tower_aux.load_model(device_map=device_map)
57
+ vision_tower_aux.to(device=device, dtype=torch.float16)
58
+
59
+ image_processor = None
60
+ image_processor = [vision_tower_aux.image_processor for vision_tower_aux in vision_tower_aux_list]
61
+
62
+ return tokenizer, model, image_processor
tempo/constants.py ADDED
@@ -0,0 +1,13 @@
1
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
2
+ WORKER_HEART_BEAT_INTERVAL = 15
3
+
4
+ LOGDIR = "."
5
+
6
+ # Model Constants
7
+ IGNORE_INDEX = -100
8
+ IMAGE_TOKEN_INDEX = -200
9
+ DEFAULT_IMAGE_TOKEN = "<image>"
10
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11
+ DEFAULT_IM_START_TOKEN = "<im_start>"
12
+ DEFAULT_IM_END_TOKEN = "<im_end>"
13
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
tempo/conversation.py ADDED
@@ -0,0 +1,545 @@
1
+ import base64
2
+ import dataclasses
3
+ from io import BytesIO
4
+ from enum import auto, Enum
5
+ from typing import Any, Union
6
+
7
+ from PIL import Image
8
+ from transformers import AutoTokenizer
9
+
10
+
11
+ class SeparatorStyle(Enum):
12
+ """Different separator style."""
13
+
14
+ SINGLE = auto()
15
+ TWO = auto()
16
+ MPT = auto()
17
+ PLAIN = auto()
18
+ LLAMA_2 = auto()
19
+ LLAMA_3 = auto()
20
+ LLAMA_3_1 = auto()
21
+ LLAMA_3_2 = auto()
22
+ QWEN = auto()
23
+ CHATML = auto()
24
+
25
+
26
+ @dataclasses.dataclass
27
+ class Conversation:
28
+ """A class that keeps all conversation history."""
29
+
30
+ system: str
31
+ roles: list[str]
32
+ messages: list[list[str]]
33
+ offset: int
34
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
35
+ sep: str = "###"
36
+ sep2: str = None
37
+ version: str = "Unknown"
38
+
39
+ tokenizer: Any = None
40
+ # Stop criteria (the default one is EOS token)
41
+ stop_str: Union[str, list[str]] = None
42
+ # Stops generation if meeting any token in this list
43
+ stop_token_ids: list[int] = None
44
+
45
+ skip_next: bool = False
46
+
47
+ def get_prompt(self):
48
+ messages = self.messages
49
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
50
+ messages = self.messages.copy()
51
+ init_role, init_msg = messages[0].copy()
52
+ init_msg = init_msg[0].replace("<image>", "").strip()
53
+ if "mmtag" in self.version:
54
+ messages[0] = (init_role, init_msg)
55
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
56
+ messages.insert(1, (self.roles[1], "Received."))
57
+ else:
58
+ messages[0] = (init_role, "<image>\n" + init_msg)
59
+
60
+ if self.sep_style == SeparatorStyle.SINGLE:
61
+ ret = self.system + self.sep
62
+ for role, message in messages:
63
+ if message:
64
+ if type(message) is tuple:
65
+ message, _, _ = message
66
+ ret += role + ": " + message + self.sep
67
+ else:
68
+ ret += role + ":"
69
+
70
+ elif self.sep_style == SeparatorStyle.TWO:
71
+ seps = [self.sep, self.sep2]
72
+ ret = self.system + seps[0]
73
+ for i, (role, message) in enumerate(messages):
74
+ if message:
75
+ if type(message) is tuple:
76
+ message, _, _ = message
77
+ ret += role + ": " + message + seps[i % 2]
78
+ else:
79
+ ret += role + ":"
80
+
81
+ elif self.sep_style == SeparatorStyle.CHATML:
82
+ ret = "" if self.system == "" else self.system + self.sep + "\n"
83
+ for role, message in messages:
84
+ if message:
85
+ if type(message) is tuple:
86
+ message, images, _ = message
87
+ message = "<image>" * len(images) + message
88
+ ret += role + "\n" + message + self.sep + "\n"
89
+ else:
90
+ ret += role + "\n"
91
+ return ret
92
+
93
+ elif self.sep_style == SeparatorStyle.MPT:
94
+ ret = self.system + self.sep
95
+ for role, message in messages:
96
+ if message:
97
+ if type(message) is tuple:
98
+ message, _, _ = message
99
+ ret += role + message + self.sep
100
+ else:
101
+ ret += role
102
+
103
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
104
+ wrap_sys = lambda msg: (
105
+ f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
106
+ )
107
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
108
+ ret = ""
109
+
110
+ for i, (role, message) in enumerate(messages):
111
+ if i == 0:
112
+ assert message, "first message should not be none"
113
+ assert role == self.roles[0], "first message should come from user"
114
+ if message:
115
+ if type(message) is tuple:
116
+ message, _, _ = message
117
+ if i == 0:
118
+ message = wrap_sys(self.system) + message
119
+ if i % 2 == 0:
120
+ message = wrap_inst(message)
121
+ ret += self.sep + message
122
+ else:
123
+ ret += " " + message + " " + self.sep2
124
+ else:
125
+ ret += ""
126
+ ret = ret.lstrip(self.sep)
127
+
128
+ elif self.sep_style == SeparatorStyle.LLAMA_3:
129
+ if self.tokenizer is None:
130
+ self.tokenizer = AutoTokenizer.from_pretrained("//path/to/llama3/tokenizer")
131
+ chat_template_messages = [{"role": "system", "content": self.system}]
132
+ for role, message in messages:
133
+ if message:
134
+ if type(message) is tuple:
135
+ message, images = message
136
+ message = "<image>" * len(images) + message
137
+ chat_template_messages.append({"role": role, "content": message})
138
+
139
+ return self.tokenizer.apply_chat_template(
140
+ chat_template_messages, tokenize=False, add_generation_prompt=True
141
+ )
142
+
143
+ elif self.sep_style == SeparatorStyle.LLAMA_3_1:
144
+ if self.tokenizer is None:
145
+ self.tokenizer = AutoTokenizer.from_pretrained("//path/to/llama3.1/tokenizer")
146
+ chat_template_messages = [{"role": "system", "content": self.system}]
147
+ for role, message in messages:
148
+ if message:
149
+ if type(message) is tuple:
150
+ message, images = message
151
+ message = "<image>" * len(images) + message
152
+ chat_template_messages.append({"role": role, "content": message})
153
+
154
+ return self.tokenizer.apply_chat_template(
155
+ chat_template_messages, tokenize=False, add_generation_prompt=False
156
+ )
157
+
158
+ elif self.sep_style == SeparatorStyle.LLAMA_3_2:
159
+ wrap_sys = lambda msg: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{msg}<|eot_id|>" if len(msg) > 0 else msg
160
+ wrap_inst_user = lambda msg: f"<|start_header_id|>user<|end_header_id|>{msg}<|eot_id|>"
161
+ wrap_inst_assistant = lambda msg: f"<|start_header_id|>assistant<|end_header_id|>{msg}<|eot_id|>"
162
+ ret = ""
163
+
164
+ for i, (role, message) in enumerate(messages):
165
+ if i == 0:
166
+ assert message, "first message should not be none"
167
+ assert role == self.roles[0], "first message should come from user"
168
+ if message:
169
+ if type(message) is tuple:
170
+ message, _, _ = message
171
+ if i == 0:
172
+ ret += wrap_sys(self.system)
173
+
174
+ if i % 2 == 0:
175
+ message = wrap_inst_user(message)
176
+ ret += message
177
+ else:
178
+ message = wrap_inst_assistant(message)
179
+ ret += message
180
+ else:
181
+ ret += ""
182
+ ret += "<|start_header_id|>assistant<|end_header_id|>"
183
+
184
+ elif self.sep_style == SeparatorStyle.PLAIN:
185
+ seps = [self.sep, self.sep2]
186
+ ret = self.system
187
+ for i, (role, message) in enumerate(messages):
188
+ if message:
189
+ if type(message) is tuple:
190
+ message, _, _ = message
191
+ ret += message + seps[i % 2]
192
+ else:
193
+ ret += ""
194
+ else:
195
+ raise ValueError(f"Invalid style: {self.sep_style}")
196
+
197
+ return ret
198
+
199
+ def append_message(self, role, message):
200
+ self.messages.append([role, message])
201
+
202
+ def process_image(
203
+ self,
204
+ image,
205
+ image_process_mode,
206
+ return_pil=False,
207
+ image_format="PNG",
208
+ max_len=1344,
209
+ min_len=672,
210
+ ):
211
+ if image_process_mode == "Pad":
212
+
213
+ def expand2square(pil_img, background_color=(122, 116, 104)):
214
+ width, height = pil_img.size
215
+ if width == height:
216
+ return pil_img
217
+ elif width > height:
218
+ result = Image.new(pil_img.mode, (width, width), background_color)
219
+ result.paste(pil_img, (0, (width - height) // 2))
220
+ return result
221
+ else:
222
+ result = Image.new(pil_img.mode, (height, height), background_color)
223
+ result.paste(pil_img, ((height - width) // 2, 0))
224
+ return result
225
+
226
+ image = expand2square(image)
227
+ elif image_process_mode in ["Default", "Crop"]:
228
+ pass
229
+ elif image_process_mode == "Resize":
230
+ image = image.resize((336, 336))
231
+ else:
232
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
233
+ if max(image.size) > max_len:
234
+ max_hw, min_hw = max(image.size), min(image.size)
235
+ aspect_ratio = max_hw / min_hw
236
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
237
+ longest_edge = int(shortest_edge * aspect_ratio)
238
+ W, H = image.size
239
+ if H > W:
240
+ H, W = longest_edge, shortest_edge
241
+ else:
242
+ H, W = shortest_edge, longest_edge
243
+ image = image.resize((W, H))
244
+ if return_pil:
245
+ return image
246
+ else:
247
+ buffered = BytesIO()
248
+ image.save(buffered, format=image_format)
249
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
250
+ return img_b64_str
251
+
252
+ def get_images(self, return_pil=False):
253
+ images = []
254
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
255
+ if i % 2 == 0:
256
+ if type(msg) is tuple:
257
+ msg, image, image_process_mode = msg
258
+ image = self.process_image(
259
+ image, image_process_mode, return_pil=return_pil
260
+ )
261
+ images.append(image)
262
+ return images
263
+
264
+ def to_gradio_chatbot(self):
265
+ ret = []
266
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
267
+ if i % 2 == 0:
268
+ if type(msg) is tuple:
269
+ msg, image, image_process_mode = msg
270
+ img_b64_str = self.process_image(
271
+ image, "Default", return_pil=False, image_format="JPEG"
272
+ )
273
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
274
+ msg = img_str + msg.replace("<image>", "").strip()
275
+ ret.append([msg, None])
276
+ else:
277
+ ret.append([msg, None])
278
+ else:
279
+ ret[-1][-1] = msg
280
+ return ret
281
+
282
+ def copy(self):
283
+ return Conversation(
284
+ system=self.system,
285
+ roles=self.roles,
286
+ messages=[[x, y] for x, y in self.messages],
287
+ offset=self.offset,
288
+ sep_style=self.sep_style,
289
+ sep=self.sep,
290
+ sep2=self.sep2,
291
+ version=self.version,
292
+ )
293
+
294
+ def dict(self):
295
+ if len(self.get_images()) > 0:
296
+ return {
297
+ "system": self.system,
298
+ "roles": self.roles,
299
+ "messages": [
300
+ [x, y[0] if type(y) is tuple else y] for x, y in self.messages
301
+ ],
302
+ "offset": self.offset,
303
+ "sep": self.sep,
304
+ "sep2": self.sep2,
305
+ }
306
+ return {
307
+ "system": self.system,
308
+ "roles": self.roles,
309
+ "messages": self.messages,
310
+ "offset": self.offset,
311
+ "sep": self.sep,
312
+ "sep2": self.sep2,
313
+ }
314
+
315
+
316
+ conv_vicuna_v0 = Conversation(
317
+ system="A chat between a curious human and an artificial intelligence assistant. "
318
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
319
+ roles=("Human", "Assistant"),
320
+ messages=(
321
+ (
322
+ "Human",
323
+ "What are the key differences between renewable and non-renewable energy sources?",
324
+ ),
325
+ (
326
+ "Assistant",
327
+ "Renewable energy sources are those that can be replenished naturally in a relatively "
328
+ "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
329
+ "Non-renewable energy sources, on the other hand, are finite and will eventually be "
330
+ "depleted, such as coal, oil, and natural gas. Here are some key differences between "
331
+ "renewable and non-renewable energy sources:\n"
332
+ "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
333
+ "energy sources are finite and will eventually run out.\n"
334
+ "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
335
+ "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
336
+ "and other negative effects.\n"
337
+ "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
338
+ "have lower operational costs than non-renewable sources.\n"
339
+ "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
340
+ "locations than non-renewable sources.\n"
341
+ "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
342
+ "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
343
+ "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
344
+ "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
345
+ ),
346
+ ),
347
+ offset=2,
348
+ sep_style=SeparatorStyle.SINGLE,
349
+ sep="###",
350
+ )
351
+
352
+ conv_vicuna_v1 = Conversation(
353
+ system="A chat between a curious user and an artificial intelligence assistant. "
354
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
355
+ roles=("USER", "ASSISTANT"),
356
+ version="v1",
357
+ messages=(),
358
+ offset=0,
359
+ sep_style=SeparatorStyle.TWO,
360
+ sep=" ",
361
+ sep2="</s>",
362
+ )
363
+
364
+ conv_llama_2 = Conversation(
365
+ system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
366
+
367
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
368
+ roles=("USER", "ASSISTANT"),
369
+ version="llama_v2",
370
+ messages=(),
371
+ offset=0,
372
+ sep_style=SeparatorStyle.LLAMA_2,
373
+ sep="<s>",
374
+ sep2="</s>",
375
+ )
376
+
377
+ conv_llava_llama_2 = Conversation(
378
+ system="You are a helpful language and vision assistant. "
379
+ "You are able to understand the visual content that the user provides, "
380
+ "and assist the user with a variety of tasks using natural language.",
381
+ roles=("USER", "ASSISTANT"),
382
+ version="llama_v2",
383
+ messages=(),
384
+ offset=0,
385
+ sep_style=SeparatorStyle.LLAMA_2,
386
+ sep="<s>",
387
+ sep2="</s>",
388
+ )
389
+
390
+ conv_mpt = Conversation(
391
+ system="""<|im_start|>system
392
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
393
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
394
+ version="mpt",
395
+ messages=(),
396
+ offset=0,
397
+ sep_style=SeparatorStyle.MPT,
398
+ sep="<|im_end|>",
399
+ )
400
+
401
+ conv_llava_plain = Conversation(
402
+ system="",
403
+ roles=("", ""),
404
+ messages=(),
405
+ offset=0,
406
+ sep_style=SeparatorStyle.PLAIN,
407
+ sep="\n",
408
+ version="plain",
409
+ )
410
+
411
+ conv_llava_v0 = Conversation(
412
+ system="A chat between a curious human and an artificial intelligence assistant. "
413
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
414
+ roles=("Human", "Assistant"),
415
+ messages=(),
416
+ offset=0,
417
+ sep_style=SeparatorStyle.SINGLE,
418
+ sep="###",
419
+ )
420
+
421
+ conv_llava_v0_mmtag = Conversation(
422
+ system="A chat between a curious user and an artificial intelligence assistant. "
423
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
424
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
425
+ roles=("Human", "Assistant"),
426
+ messages=(),
427
+ offset=0,
428
+ sep_style=SeparatorStyle.SINGLE,
429
+ sep="###",
430
+ version="v0_mmtag",
431
+ )
432
+
433
+ conv_llava_v1 = Conversation(
434
+ system="A chat between a curious human and an artificial intelligence assistant. "
435
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
436
+ roles=("USER", "ASSISTANT"),
437
+ version="v1",
438
+ messages=(),
439
+ offset=0,
440
+ sep_style=SeparatorStyle.TWO,
441
+ sep=" ",
442
+ sep2="</s>",
443
+ )
444
+
445
+ conv_llava_v1_mmtag = Conversation(
446
+ system="A chat between a curious user and an artificial intelligence assistant. "
447
+ "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
448
+ "The visual content will be provided with the following format: <Image>visual content</Image>.",
449
+ roles=("USER", "ASSISTANT"),
450
+ messages=(),
451
+ offset=0,
452
+ sep_style=SeparatorStyle.TWO,
453
+ sep=" ",
454
+ sep2="</s>",
455
+ version="v1_mmtag",
456
+ )
457
+
458
+ conv_mistral_instruct = Conversation(
459
+ system="",
460
+ roles=("USER", "ASSISTANT"),
461
+ version="llama_v2",
462
+ messages=(),
463
+ offset=0,
464
+ sep_style=SeparatorStyle.LLAMA_2,
465
+ sep="",
466
+ sep2="</s>",
467
+ )
468
+
469
+ conv_chatml_direct = Conversation(
470
+ system="""<|im_start|>system
471
+ Answer the questions.""",
472
+ roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
473
+ version="mpt",
474
+ messages=(),
475
+ offset=0,
476
+ sep_style=SeparatorStyle.MPT,
477
+ sep="<|im_end|>",
478
+ )
479
+
480
+ conv_llama3 = Conversation(
481
+ system="""You are a helpful assistant.""",
482
+ roles=("user", "assistant"),
483
+ version="llama3",
484
+ messages=(),
485
+ offset=0,
486
+ sep_style=SeparatorStyle.LLAMA_3,
487
+ sep="<|eot_id|>",
488
+ )
489
+ conv_llama3_2 = Conversation(
490
+ system="""You are a helpful assistant.""",
491
+ roles=("user", "assistant"),
492
+ version="llama3_2",
493
+ messages=(),
494
+ offset=0,
495
+ sep_style=SeparatorStyle.LLAMA_3_2,
496
+ sep="<|eot_id|>",
497
+ )
498
+
499
+ conv_phi3_instruct = Conversation(
500
+ system="""<|system|>\nYou are a helpful AI assistant.""",
501
+ roles=("\n<|user|>\n", "\n<|assistant|>\n"),
502
+ version="phi3",
503
+ messages=(),
504
+ offset=0,
505
+ sep_style=SeparatorStyle.MPT,
506
+ sep="<|end|>",
507
+ )
508
+
509
+ conv_qwen = Conversation(
510
+ system="""<|im_start|>system
511
+ You are a helpful assistant.""",
512
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
513
+ version="qwen",
514
+ messages=[],
515
+ offset=0,
516
+ sep_style=SeparatorStyle.CHATML,
517
+ sep="<|im_end|>",
518
+ )
519
+
520
+ default_conversation = conv_qwen
521
+ conv_templates = {
522
+ "default": conv_vicuna_v0,
523
+ "v0": conv_vicuna_v0,
524
+ "v1": conv_vicuna_v1,
525
+ "vicuna_v1": conv_vicuna_v1,
526
+ "llama_2": conv_llama_2,
527
+ "mistral_instruct": conv_mistral_instruct,
528
+ "chatml_direct": conv_chatml_direct,
529
+ "mistral_direct": conv_chatml_direct,
530
+ "plain": conv_llava_plain,
531
+ "v0_plain": conv_llava_plain,
532
+ "llava_v0": conv_llava_v0,
533
+ "v0_mmtag": conv_llava_v0_mmtag,
534
+ "llava_v1": conv_llava_v1,
535
+ "v1_mmtag": conv_llava_v1_mmtag,
536
+ "llava_llama_2": conv_llava_llama_2,
537
+ "mpt": conv_mpt,
538
+ "llama3": conv_llama3,
539
+ "llama3_2": conv_llama3_2,
540
+ "phi3": conv_phi3_instruct,
541
+ "qwen": conv_qwen,
542
+ }
543
+
544
+ if __name__ == "__main__":
545
+ print(default_conversation.get_prompt())
tempo/language_model/__pycache__/modeling_tempo_qwen.cpython-312.pyc ADDED
Binary file (7.46 kB).
 
tempo/language_model/modeling_tempo_qwen.py ADDED
@@ -0,0 +1,231 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from typing import Optional, Union, Callable
19
+ from transformers.utils import logging
20
+ from transformers.cache_utils import Cache
21
+ from transformers import AutoConfig, AutoModelForCausalLM
22
+ from transformers.modeling_outputs import CausalLMOutputWithPast
23
+ from transformers import Qwen3Config, Qwen3ForCausalLM, Qwen3Model
24
+ from transformers.modeling_utils import PreTrainedModel
25
+ from transformers.generation.streamers import BaseStreamer
26
+ from transformers.generation.utils import (
27
+ GenerateOutput,
28
+ GenerationConfig,
29
+ LogitsProcessorList,
30
+ StoppingCriteriaList,
31
+ )
32
+
33
+
34
+ from tempo.tempo_arch import TempoMetaForCausalLM, TempoMetaModel
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+ class TempoConfig(Qwen3Config):
39
+ model_type = "tempo_qwen"
40
+ debug = "debug"
41
+
42
+ class TempoQwenModel(TempoMetaModel, Qwen3Model):
43
+ config_class = TempoConfig
44
+
45
+ def __init__(self, config: Qwen3Config):
46
+ super(TempoQwenModel, self).__init__(config)
47
+
48
+ class TempoQwenForCausalLM(Qwen3ForCausalLM, TempoMetaForCausalLM):
49
+ config_class = TempoConfig
50
+
51
+ def __init__(self, config):
52
+ super(Qwen3ForCausalLM, self).__init__(config)
53
+ config.model_type = "tempo_qwen"
54
+ config.rope_scaling = None
55
+
56
+ self.model = TempoQwenModel(config)
57
+ self.vocab_size = config.vocab_size
58
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
59
+
60
+ # Initialize weights and apply final processing
61
+ self.post_init()
62
+
63
+ def get_model(self):
64
+ return self.model
65
+
66
+ def forward(
67
+ self,
68
+ input_ids: Optional[torch.LongTensor] = None,
69
+ attention_mask: Optional[torch.Tensor] = None,
70
+ position_ids: Optional[torch.LongTensor] = None,
71
+ past_key_values: Optional[Cache] = None,
72
+ inputs_embeds: Optional[torch.FloatTensor] = None,
73
+ labels: Optional[torch.LongTensor] = None,
74
+ use_cache: Optional[bool] = None,
75
+ cache_position: Optional[torch.LongTensor] = None,
76
+ logits_to_keep: Union[int, torch.Tensor] = 0,
77
+ images: Optional[torch.FloatTensor] = None,
78
+ image_sizes: Optional[list[list[int]]] = None,
79
+ vlm_inputs: Optional[dict] = None,
80
+ seg_timestamps: Optional[torch.LongTensor] = None,
81
+ batch_split_size: Optional[list[int]] = None,
82
+ **kwargs,
83
+ ) -> CausalLMOutputWithPast:
84
+ if inputs_embeds is None:
85
+ (
86
+ input_ids,
87
+ position_ids,
88
+ attention_mask,
89
+ past_key_values,
90
+ inputs_embeds,
91
+ labels,
92
+ ) = self.prepare_inputs_labels_for_multimodal(
93
+ input_ids,
94
+ position_ids,
95
+ attention_mask,
96
+ past_key_values,
97
+ labels,
98
+ images,
99
+ image_sizes,
100
+ vlm_inputs,
101
+ seg_timestamps,
102
+ batch_split_size,
103
+ )
104
+
105
+ return super().forward(
106
+ input_ids=input_ids,
107
+ attention_mask=attention_mask,
108
+ position_ids=position_ids,
109
+ past_key_values=past_key_values,
110
+ inputs_embeds=inputs_embeds,
111
+ labels=labels,
112
+ use_cache=use_cache,
113
+ cache_position=cache_position,
114
+ logits_to_keep=logits_to_keep,
115
+ **kwargs,
116
+ )
117
+
118
+ @torch.no_grad()
119
+ def generate(
120
+ self,
121
+ inputs: Optional[torch.Tensor] = None,
122
+ generation_config: Optional[GenerationConfig] = None,
123
+ logits_processor: Optional[LogitsProcessorList] = None,
124
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
125
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
126
+ synced_gpus: Optional[bool] = None,
127
+ assistant_model: Optional["PreTrainedModel"] = None,
128
+ streamer: Optional["BaseStreamer"] = None,
129
+ negative_prompt_ids: Optional[torch.Tensor] = None,
130
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
131
+ use_model_defaults: Optional[bool] = None,
132
+ custom_generate: Optional[Union[str, Callable]] = None,
133
+ images: Optional[torch.Tensor] = None,
134
+ image_sizes: Optional[torch.Tensor] = None,
135
+ **kwargs,
136
+ ) -> Union[GenerateOutput, torch.LongTensor]:
137
+ position_ids = kwargs.pop("position_ids", None)
138
+ attention_mask = kwargs.pop("attention_mask", None)
139
+ vlm_inputs = kwargs.pop("vlm_inputs", None)
140
+ seg_timestamps = kwargs.pop("seg_timestamps", None)
141
+ relevance = kwargs.pop("relevance", None) # when using external retriever
142
+
143
+ if "inputs_embeds" in kwargs:
144
+ raise NotImplementedError("`inputs_embeds` is not supported")
145
+
146
+ if vlm_inputs is not None:
147
+ (
148
+ inputs,
149
+ position_ids,
150
+ attention_mask,
151
+ _,
152
+ inputs_embeds,
153
+ _,
154
+ ) = self.prepare_inputs_labels_for_multimodal(
155
+ inputs,
156
+ position_ids,
157
+ attention_mask,
158
+ None,
159
+ None,
160
+ images,
161
+ image_sizes=image_sizes,
162
+ vlm_inputs=vlm_inputs,
163
+ seg_timestamps=seg_timestamps,
164
+ relevance=relevance,
165
+ )
166
+ elif images is not None:
167
+ (
168
+ inputs,
169
+ position_ids,
170
+ attention_mask,
171
+ _,
172
+ inputs_embeds,
173
+ _,
174
+ ) = self.prepare_inputs_labels_for_multimodal(
175
+ inputs,
176
+ position_ids,
177
+ attention_mask,
178
+ None,
179
+ None,
180
+ images,
181
+ image_sizes=image_sizes,
182
+ )
183
+ else:
184
+ inputs_embeds = self.get_model().embed_tokens(inputs)
185
+
186
+ # if attention_mask is None:
187
+ # # avoid warning
188
+ # attention_mask = torch.ones(
189
+ # inputs_embeds.shape[:2],
190
+ # dtype=torch.long,
191
+ # device=inputs_embeds.device
192
+ # )
193
+
194
+ return super().generate(
195
+ inputs=None,
196
+ generation_config=generation_config,
197
+ logits_processor=logits_processor,
198
+ stopping_criteria=stopping_criteria,
199
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
200
+ synced_gpus=synced_gpus,
201
+ assistant_model=assistant_model,
202
+ streamer=streamer,
203
+ negative_prompt_ids=negative_prompt_ids,
204
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
205
+ use_model_defaults=use_model_defaults,
206
+ custom_generate=custom_generate,
207
+ position_ids=position_ids,
208
+ attention_mask=attention_mask,
209
+ inputs_embeds=inputs_embeds,
210
+ **kwargs,
211
+ )
212
+
213
+ def prepare_inputs_for_generation(
214
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
215
+ ):
216
+ images = kwargs.pop("images", None)
217
+ image_sizes = kwargs.pop("image_sizes", None)
218
+ inputs = super().prepare_inputs_for_generation(
219
+ input_ids,
220
+ past_key_values=past_key_values,
221
+ inputs_embeds=inputs_embeds,
222
+ **kwargs,
223
+ )
224
+ if images is not None:
225
+ inputs["images"] = images
226
+ if image_sizes is not None:
227
+ inputs["image_sizes"] = image_sizes
228
+ return inputs
229
+
230
+ AutoConfig.register("tempo_qwen", TempoConfig)
231
+ AutoModelForCausalLM.register(TempoConfig, TempoQwenForCausalLM)
tempo/mm_datautils.py ADDED
@@ -0,0 +1,1607 @@
1
+ import os
2
+ import copy
3
+ from typing import List
4
+ from packaging import version
5
+ from collections.abc import Sequence
6
+
7
+ import torch
8
+ import tokenizers
9
+ import transformers
10
+ import numpy as np
11
+ from PIL import Image
12
+ from transformers import StoppingCriteria
13
+ from qwen_vl_utils import process_vision_info
14
+ from torch import distributed as dist
15
+ from torch.distributed.fsdp import (
16
+ FullStateDictConfig,
17
+ FullyShardedDataParallel as FSDP,
18
+ StateDictType,
19
+ )
20
+
21
+ from tempo import conversation as conversation_lib
22
+ from tempo.constants import (
23
+ DEFAULT_IM_END_TOKEN,
24
+ DEFAULT_IM_START_TOKEN,
25
+ DEFAULT_IMAGE_TOKEN,
26
+ IGNORE_INDEX,
27
+ IMAGE_TOKEN_INDEX,
28
+ )
29
+
30
+ IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse(
31
+ "0.14"
32
+ )
33
+
34
+ class KeywordsStoppingCriteria(StoppingCriteria):
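+ # Stopping criterion that fires once every sequence in the batch has produced one of the given keywords,
+ # checked both on the raw token ids and on the decoded tail of the output.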
35
+ def __init__(self, keywords, tokenizer, input_ids):
36
+ self.keywords = keywords
37
+ self.keyword_ids = []
38
+ self.max_keyword_len = 0
39
+ for keyword in keywords:
40
+ cur_keyword_ids = tokenizer(keyword).input_ids
41
+ if (
42
+ len(cur_keyword_ids) > 1
43
+ and cur_keyword_ids[0] == tokenizer.bos_token_id
44
+ ):
45
+ cur_keyword_ids = cur_keyword_ids[1:]
46
+ if len(cur_keyword_ids) > self.max_keyword_len:
47
+ self.max_keyword_len = len(cur_keyword_ids)
48
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
49
+ self.tokenizer = tokenizer
50
+ self.start_len = input_ids.shape[1]
51
+
52
+ def call_for_batch(
53
+ self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
54
+ ) -> bool:
55
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
56
+ self.keyword_ids = [
57
+ keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids
58
+ ]
59
+ for keyword_id in self.keyword_ids:
60
+ truncated_output_ids = output_ids[0, -keyword_id.shape[0] :]
61
+ if torch.equal(truncated_output_ids, keyword_id):
62
+ return True
63
+ outputs = self.tokenizer.batch_decode(
64
+ output_ids[:, -offset:], skip_special_tokens=True
65
+ )[0]
66
+ for keyword in self.keywords:
67
+ if keyword in outputs:
68
+ return True
69
+ return False
70
+
71
+ def __call__(
72
+ self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
73
+ ) -> bool:
74
+ outputs = []
75
+ for i in range(output_ids.shape[0]):
76
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
77
+ return all(outputs)
78
+
79
+
80
+ def safe_save_model_for_hf_trainer(
81
+ trainer: transformers.Trainer, output_dir: str
82
+ ) -> None:
83
+ """Collects the state dict and dump to disk."""
84
+ global_rank = dist.get_rank()
85
+ save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
86
+ if len(trainer.args.fsdp) == 0:
87
+ cpu_state_dict = trainer.model.state_dict()
88
+ else:
89
+ with FSDP.state_dict_type(
90
+ trainer.model, StateDictType.FULL_STATE_DICT, save_policy
91
+ ):
92
+ cpu_state_dict = trainer.model.state_dict()
93
+
94
+ for key in cpu_state_dict.keys():
95
+ cpu_state_dict[key] = cpu_state_dict[key].to(torch.bfloat16)
96
+
97
+ if global_rank == 0:
98
+ trainer.model.config.save_pretrained(output_dir)
99
+ current_folder = output_dir.split("/")[-1]
100
+ parent_folder = os.path.dirname(output_dir)
101
+ save_path = os.path.join(output_dir, "pytorch_model.bin")
102
+ if getattr(trainer.args, "tune_mm_mlp_adapter", False) and not getattr(
103
+ trainer.args, "tune_text_decoder", False
104
+ ):
105
+ # Only save Adapter
106
+ keys_to_match = ["mm_projector"]
107
+ if getattr(trainer.args, "use_im_start_end", False):
108
+ keys_to_match.extend(["embed_tokens", "embed_in"])
109
+
110
+ freeze_layer_remove = []
111
+ for key in cpu_state_dict.keys():
112
+ remove = True
113
+ for key_match in keys_to_match:
114
+ if key_match in key:
115
+ remove = False
116
+ break
117
+ if remove:
118
+ freeze_layer_remove.append(key)
119
+ for key in freeze_layer_remove:
120
+ del cpu_state_dict[key]
121
+
122
+ if current_folder.startswith("checkpoint-"):
123
+ mm_projector_folder = os.path.join(parent_folder, "mm_projector")
124
+ os.makedirs(mm_projector_folder, exist_ok=True)
125
+ save_path = os.path.join(mm_projector_folder, f"{current_folder}.bin")
126
+ else:
127
+ save_path = os.path.join(output_dir, "mm_projector.bin")
128
+ torch.save(cpu_state_dict, save_path)
129
+
130
+
131
+ def _tokenize_fn(
132
+ strings: Sequence[str],
133
+ tokenizer: transformers.PreTrainedTokenizer,
134
+ ) -> dict:
135
+ """Tokenize a list of strings."""
136
+ tokenized_list = [
137
+ tokenizer(
138
+ text,
139
+ return_tensors="pt",
140
+ padding="longest",
141
+ max_length=tokenizer.model_max_length,
142
+ truncation=True,
143
+ )
144
+ for text in strings
145
+ ]
146
+ input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
147
+ input_ids_lens = labels_lens = [
148
+ tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
149
+ for tokenized in tokenized_list
150
+ ]
151
+ return dict(
152
+ input_ids=input_ids,
153
+ labels=labels,
154
+ input_ids_lens=input_ids_lens,
155
+ labels_lens=labels_lens,
156
+ )
157
+
158
+
159
+ def _mask_targets(target, tokenized_lens, speakers) -> None:
160
+ # cur_idx = 0
161
+ cur_idx = tokenized_lens[0]
162
+ tokenized_lens = tokenized_lens[1:]
163
+ target[:cur_idx] = IGNORE_INDEX
164
+ for tokenized_len, speaker in zip(tokenized_lens, speakers):
165
+ if speaker == "human":
166
+ target[cur_idx + 2 : cur_idx + tokenized_len] = IGNORE_INDEX
167
+ cur_idx += tokenized_len
168
+
169
+
170
+ def _add_speaker_and_signal(header, source, get_conversation: bool = True):
171
+ """Add speaker and start/end signal on each round."""
172
+ BEGIN_SIGNAL = "### "
173
+ END_SIGNAL = "\n"
174
+ conversation = header
175
+ for sentence in source:
176
+ from_str = sentence["from"]
177
+ if from_str.lower() == "human":
178
+ from_str = conversation_lib.default_conversation.roles[0]
179
+ elif from_str.lower() == "gpt":
180
+ from_str = conversation_lib.default_conversation.roles[1]
181
+ else:
182
+ from_str = "unknown"
183
+ sentence["value"] = (
184
+ BEGIN_SIGNAL + from_str + ": " + sentence["value"] + END_SIGNAL
185
+ )
186
+ if get_conversation:
187
+ conversation += sentence["value"]
188
+ conversation += BEGIN_SIGNAL
189
+ return conversation
190
+
191
+
192
+ def expand2square(pil_img, background_color):
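+ # Pad the shorter side with background_color so the image becomes square, keeping the original centered.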
193
+ width, height = pil_img.size
194
+ if width == height:
195
+ return pil_img
196
+ elif width > height:
197
+ result = Image.new(pil_img.mode, (width, width), background_color)
198
+ result.paste(pil_img, (0, (width - height) // 2))
199
+ return result
200
+ else:
201
+ result = Image.new(pil_img.mode, (height, height), background_color)
202
+ result.paste(pil_img, ((height - width) // 2, 0))
203
+ return result
204
+
205
+
206
+ def crop2square(pil_img):
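+ # Center-crop the longer side so the image becomes square.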
207
+ width, height = pil_img.size
208
+ if width == height:
209
+ return pil_img
210
+ elif width > height:
211
+ left = (width - height) // 2
212
+ right = left + height
213
+ top = 0
214
+ bottom = height
215
+ return pil_img.crop((left, top, right, bottom))
216
+ else:
217
+ top = (height - width) // 2
218
+ bottom = top + width
219
+ left = 0
220
+ right = width
221
+ return pil_img.crop((left, top, right, bottom))
222
+
223
+
224
+ def perpare_input_for_qwen_input(chunk_dict, pad_token_ids):
225
+ """Currently, only batch size = 1 is supported for evaluation."""
226
+
227
+ qwenvl_input_dict = {}
228
+ has_video = any(["video" in key for key in list(chunk_dict.keys())])
229
+
230
+ qwenvl_input_dict["input_ids"] = torch.nn.utils.rnn.pad_sequence(
231
+ chunk_dict["vlm_input_ids"],
232
+ batch_first=True,
233
+ padding_value=pad_token_ids,
234
+ )
235
+ qwenvl_input_dict["attention_mask"] = torch.nn.utils.rnn.pad_sequence(
236
+ chunk_dict["vlm_attention_mask"],
237
+ batch_first=True,
238
+ padding_value=0,
239
+ )
240
+
241
+ if has_video:
242
+ qwenvl_input_dict["pixel_values_videos"] = torch.cat(chunk_dict["pixel_values_videos"], dim=0)
243
+ qwenvl_input_dict["video_grid_thw"] = torch.cat(chunk_dict["video_grid_thw"], dim=0)
244
+ else:
245
+ qwenvl_input_dict["pixel_values"] = torch.cat(chunk_dict["pixel_values"], dim=0)
246
+ qwenvl_input_dict["image_grid_thw"] = torch.cat(chunk_dict["image_grid_thw"], dim=0)
247
+
248
+ return qwenvl_input_dict
249
+
250
+
251
+ def construct_message(
252
+ content_data,
253
+ data_type,
254
+ query,
255
+ multimodal_processor,
256
+ sample_fps=1,
257
+ return_text=False,
258
+ ):
259
+ # # prompt 0 (used during training)
260
+ # system_message = (
261
+ # "You are a query-conditioned visual compressor. "
262
+ # "Store in the provided memory tokens the minimal visual information needed to answer the Query. "
263
+ # "Ignore irrelevant details."
264
+ # )
265
+ # prompt 1 (used during inference)
266
+ system_message = (
267
+ "You are a query-conditioned visual compressor. "
268
+ "Store in the provided memory tokens the minimal visual information needed to answer the Query. "
269
+ "Ignore irrelevant details. "
270
+ "Now, before compressing, answer exactly 'Yes' or 'No': is this segment relevant to the Query?"
271
+ )
272
+
273
+ user_message = f"\nQuery:\n{query}"
274
+ # assistant_message = "Scanning for target features... The visual confidence representation is:"
275
+ assistant_message = None
276
+
277
+ if data_type == "image":
278
+ messages = [
279
+ {
280
+ "role": "system",
281
+ "content": [{"type": "text", "text": system_message}],
282
+ },
283
+ {
284
+ "role": "user",
285
+ "content": [
286
+ {"type": "image", "image": content_data},
287
+ {"type": "text", "text": user_message},
288
+ ],
289
+ },
290
+ ]
291
+ elif data_type == "video":
292
+ messages = [
293
+ {
294
+ "role": "system",
295
+ "content": [{"type": "text", "text": system_message}],
296
+ },
297
+ {
298
+ "role": "user",
299
+ "content": [
300
+ {
301
+ "type": "video",
302
+ "video": content_data,
303
+ "sample_fps": sample_fps,
304
+ },
305
+ {"type": "text", "text": user_message},
306
+ ],
307
+ },
308
+ ]
309
+ else:
310
+ raise ValueError(f"Unknown data type: {data_type}")
311
+
312
+ if return_text:
313
+ messages = multimodal_processor.apply_chat_template(
314
+ messages,
315
+ tokenize=False,
316
+ add_generation_prompt=True,
317
+ )
318
+
319
+ if assistant_message is not None:
320
+ messages = messages + assistant_message
321
+
322
+ return messages
323
+
324
+
325
+ def video_process_with_frame_idx(
326
+ chunk_frames, multimodal_processor, query, sample_fps=2, frame_offset=0
327
+ ):
328
+ messages = construct_message(
329
+ chunk_frames, "video", query, multimodal_processor, sample_fps
330
+ )
331
+ text = multimodal_processor.apply_chat_template(
332
+ messages, tokenize=False, add_generation_prompt=True
333
+ )
334
+ image_inputs, video_inputs, video_kwargs = process_vision_info(
335
+ [messages],
336
+ return_video_kwargs=True,
337
+ image_patch_size=16,
338
+ return_video_metadata=True,
339
+ )
340
+ if video_inputs is not None:
341
+ video_inputs, video_metadatas = zip(*video_inputs)
342
+ video_inputs, video_metadatas = (
343
+ list(video_inputs),
344
+ list(video_metadatas),
345
+ )
346
+ else:
347
+ video_metadatas = None
348
+
349
+ if video_metadatas is not None:
350
+ video_metadatas[0]["frames_indices"] = [
351
+ f + frame_offset for f in video_metadatas[0]["frames_indices"]
352
+ ]
353
+
354
+ inputs = multimodal_processor(
355
+ text=[text],
356
+ images=image_inputs,
357
+ videos=video_inputs,
358
+ video_metadata=video_metadatas,
359
+ **video_kwargs,
360
+ do_resize=False,
361
+ return_tensors="pt",
362
+ )
363
+
364
+ return inputs
365
+
366
+
367
+ def process_qwen_content(
368
+ content_data,
369
+ data_type,
370
+ sources,
371
+ multimodal_processor,
372
+ real_fps=None,
373
+ frame_windows=8,
374
+ frame_stride=8,
375
+ is_eval=False,
376
+ ):
377
+ """
378
+ content_data:
379
+ - 'image': PIL.Image
380
+ - 'video': List[PIL.Image] or np.ndarray (T, H, W, C)
381
+ data_type: 'text', 'image' or 'video'
382
+ """
383
+
384
+ # query process
385
+ if is_eval:
386
+ # for evaluation, please input only text string
387
+ assert isinstance(sources, str), "During evaluation, sources should be a single query string."
388
+ query = sources
389
+ else:
390
+ conversation = sources[0]
391
+ if isinstance(conversation, list):
392
+ # This is acceptable during training to learn better representations,
393
+ # but cannot be used during inference as it may lead to data leakage.
394
+ # Currently, only single-turn dialogue is supported during inference.
395
+ # Cap the context at the first 16 messages (at most 8 dialogue rounds).
396
+ human_queries = []
397
+ for turn in conversation[:16]:
398
+ if turn.get("from") == "human":
399
+ clean_text = turn["value"].replace("<image>", "").strip()
400
+ if clean_text:
401
+ human_queries.append(clean_text)
402
+
403
+ query = "\n".join(
404
+ [f"Context turn {i + 1}: {q}" for i, q in enumerate(human_queries)]
405
+ )
406
+ else:
407
+ query = "Describe this content."
408
+ if not query.strip():
409
+ query = "Describe this content."
410
+
411
+ chunk_results = []
412
+
413
+ def resize_images(frames, resolution=512):
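+ # Downscale frames whose longer edge exceeds `resolution`, rounding each side to a multiple of 16
+ # (the vision patch size assumed here); smaller frames pass through unchanged.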
414
+ resized_frames = []
415
+ for f in frames:
416
+ w, h = f.size
417
+ max_edge = max(w, h)
418
+ if max_edge > resolution:
419
+ ratio = resolution / max_edge
420
+ new_w = int(round((w * ratio) / 16) * 16)
421
+ new_h = int(round((h * ratio) / 16) * 16)
422
+ new_w = max(16, new_w)
423
+ new_h = max(16, new_h)
424
+ f = f.resize((new_w, new_h), resample=Image.Resampling.BICUBIC)
425
+ resized_frames.append(f)
426
+ return resized_frames
427
+
428
+ # === Text ===
429
+ if data_type == "text":
430
+ content_data = Image.new("RGB", (336, 336), color=(255, 255, 255)) # dummy image
431
+ messages = construct_message(
432
+ content_data, "image", query, multimodal_processor, return_text=True
433
+ ) # str
434
+ inputs = multimodal_processor(
435
+ text=[messages],
436
+ images=[content_data],
437
+ padding=False,
438
+ return_tensors="pt",
439
+ )
440
+ chunk_results.append(inputs)
441
+
442
+ # === Image ===
443
+ elif data_type == "image":
444
+ if isinstance(content_data, list):
445
+ # multi-image
446
+ content_data = resize_images(content_data, resolution=512)
447
+ messages = construct_message(
448
+ content_data[0], data_type, query, multimodal_processor, return_text=True,
449
+ ) # str
450
+ inputs = multimodal_processor(
451
+ text=[messages] * len(content_data),
452
+ images=content_data,
453
+ padding=True,
454
+ return_tensors="pt",
455
+ )
456
+ else:
457
+ messages = construct_message(
458
+ content_data, data_type, query, multimodal_processor, return_text=True
459
+ ) # str
460
+ inputs = multimodal_processor(
461
+ text=[messages],
462
+ images=[content_data],
463
+ padding=True,
464
+ return_tensors="pt",
465
+ )
466
+ chunk_results.append(inputs)
467
+
468
+ # === Video ===
469
+ elif data_type == "video":
470
+ if isinstance(content_data, np.ndarray):
471
+ frames = [Image.fromarray(f) for f in content_data]
472
+ else:
473
+ frames = content_data
474
+
475
+ if frames:
476
+ frames = resize_images(frames, resolution=512)
477
+
478
+ total_frames = len(frames)
479
+ window_size = frame_windows
480
+ stride = frame_stride
481
+
482
+ for i in range(0, total_frames, stride):
483
+ start_idx = i
484
+ frame_offset = i # used to compute timestamps
485
+ end_idx = min(start_idx + window_size, total_frames)
486
+
487
+ chunk_frames = frames[start_idx:end_idx]
488
+ # if len(chunk_frames) < window_size:
489
+ # chunk_frames.extend(
490
+ # [chunk_frames[-1]] * (window_size - len(chunk_frames))
491
+ # )
492
+
493
+ if len(chunk_frames) < window_size and len(chunk_frames) == 1:
494
+ chunk_frames.append(chunk_frames[-1])
495
+ print(f"Qwen processor requires at least 2 frames as video input, copy last frame to {len(chunk_frames)}")
496
+
497
+ inputs = video_process_with_frame_idx(
498
+ chunk_frames, multimodal_processor, query, real_fps, frame_offset,
499
+ )
500
+ chunk_results.append(inputs)
501
+
502
+ else:
503
+ raise ValueError(f"Unknown data type: {data_type}")
504
+
505
+ # ============Group for batch process===================
506
+ chunk_dict = {}
507
+ for key in chunk_results[0]:
508
+ if key in ["input_ids", "attention_mask"]:
509
+ chunk_dict[f"vlm_{key}"] = [r[key].squeeze(dim=0) for r in chunk_results]
510
+ else:
511
+ chunk_dict[key] = [r[key] for r in chunk_results]
512
+
513
+ if is_eval:
514
+ return perpare_input_for_qwen_input(
515
+ chunk_dict, multimodal_processor.tokenizer.pad_token_id
516
+ )
517
+
518
+ return chunk_dict
519
+
520
+
521
+ def compute_segment_timestamp(
522
+ num_segments,
523
+ tokenizer,
524
+ real_fps,
525
+ stride=None,
526
+ window_size=None,
527
+ use_center_timestamp=True,
528
+ ):
529
+ """
530
+ The current version only supports non-overlapping segments.
531
+ You need to modify the timestamp computation to support overlapping segments.
532
+ """
533
+ step = stride if stride is not None else window_size
534
+ fps = real_fps if real_fps and real_fps > 0 else 1.0
535
+
536
+ seg_timestamps_ids = []
537
+ for i in range(num_segments):
538
+ start_frame_idx = i * step
539
+ if use_center_timestamp:
540
+ frame_idx = start_frame_idx + (window_size / 2)
541
+ else:
542
+ frame_idx = start_frame_idx
543
+ cur_timestamp_sec = frame_idx / fps
544
+ text = f"<{cur_timestamp_sec:.1f} seconds>"
545
+
546
+ ids = tokenizer.encode(text, add_special_tokens=False)
547
+ seg_timestamps_ids.append(ids)
548
+
549
+ return seg_timestamps_ids
550
+
551
+
552
+ def compute_sample_indices(
553
+ total_frames: int,
554
+ original_fps: float,
555
+ target_fps: float,
556
+ min_frames: int,
557
+ max_frames: int,
558
+ ) -> List[int]:
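+ # Worked example with assumed values: total_frames=300, original_fps=30, target_fps=1, min_frames=4, max_frames=64
+ # -> a 10 s clip, 10 target frames (within the limits), indices spread evenly over [0, 299].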
559
+ if total_frames <= 1:
560
+ return [0]
561
+
562
+ if original_fps is None or original_fps <= 0:
563
+ original_fps = target_fps
564
+
565
+ video_duration = total_frames / original_fps
566
+ target_num_frames = max(1, round(video_duration * target_fps))
567
+
568
+ final_num_frames = target_num_frames
569
+ if final_num_frames < min_frames:
570
+ print(
571
+ f"Upsampling video from {target_num_frames} to {min_frames} frames (min_frames limit)."
572
+ )
573
+ final_num_frames = min_frames
574
+ elif final_num_frames > max_frames:
575
+ print(
576
+ f"Downsampling video from {target_num_frames} to {max_frames} frames (max_frames limit)."
577
+ )
578
+ final_num_frames = max_frames
579
+
580
+ if final_num_frames == 1:
581
+ return [total_frames - 1]
582
+
583
+ indices = np.linspace(0, total_frames - 1, final_num_frames).astype(int)
584
+ indices = np.clip(indices, 0, total_frames - 1)
585
+
586
+ return indices.tolist()
587
+
588
+
589
+ def process_images(images, image_processor, model_cfg):
590
+ # if image_processor is None:
591
+ # raise ValueError("image_processor cannot be None")
592
+ if isinstance(image_processor, list):
593
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
594
+ processor_aux_list = image_processor
595
+ new_images_aux_list = []
596
+ for image in images:
597
+ if isinstance(image, np.ndarray):
598
+ image = Image.fromarray(image)
599
+ image_aux_list = []
600
+ for processor_aux in processor_aux_list:
601
+ image_aux = image
602
+ if hasattr(processor_aux, "image_mean"):
603
+ try:
604
+ target_resolution = processor_aux.crop_size["height"]
605
+ except:
606
+ target_resolution = processor_aux.size["height"]
607
+ # image_aux = expand2square(
608
+ # image_aux, tuple(int(x * 255) for x in processor_aux.image_mean)
609
+ # ).resize((target_resolution, target_resolution))
610
+ if image_aspect_ratio == "pad":
611
+ image_aux = expand2square(
612
+ image_aux,
613
+ tuple(int(x * 255) for x in processor_aux.image_mean),
614
+ )
615
+ elif image_aspect_ratio == "crop":
616
+ image_aux = crop2square(image_aux)
617
+ image_aux = image_aux.resize((target_resolution, target_resolution))
618
+
619
+ image_aux = processor_aux.preprocess(image_aux, return_tensors="pt")[
620
+ "pixel_values"
621
+ ][0]
622
+ image_aux_list.append(image_aux)
623
+ new_images_aux_list.append(image_aux_list)
624
+ new_images_aux_list = [
625
+ list(batch_image_aux) for batch_image_aux in zip(*new_images_aux_list)
626
+ ]
627
+ new_images_aux_list = [
628
+ torch.stack(image_aux).half().cuda() for image_aux in new_images_aux_list
629
+ ]
630
+ return new_images_aux_list
631
+ else:
632
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
633
+ new_images = []
634
+ if image_aspect_ratio == "pad":
635
+ for image in images:
636
+ image = expand2square(
637
+ image, tuple(int(x * 255) for x in image_processor.image_mean)
638
+ )
639
+ image = image_processor.preprocess(image, return_tensors="pt")[
640
+ "pixel_values"
641
+ ][0]
642
+ new_images.append(image)
643
+ elif image_aspect_ratio == "crop":
644
+ for image in images:
645
+ image = crop2square(image)
646
+ image = image_processor.preprocess(image, return_tensors="pt")[
647
+ "pixel_values"
648
+ ][0]
649
+ new_images.append(image)
650
+ else:
651
+ return image_processor(images, return_tensors="pt")["pixel_values"]
652
+ if all(x.shape == new_images[0].shape for x in new_images):
653
+ new_images = torch.stack(new_images, dim=0)
654
+ return new_images
655
+
656
+
657
+ def preprocess_multimodal(sources: Sequence[str], data_args) -> dict:
658
+ is_multimodal = data_args.is_multimodal
659
+ if not is_multimodal:
660
+ return sources
661
+
662
+ for source in sources:
663
+ for sentence in source:
664
+ num_im = sentence["value"].count(DEFAULT_IMAGE_TOKEN)
665
+ if num_im == 1 or "<video>" in sentence["value"]:
666
+ # rewrite only when the visual input is a single image or a video (not multiple images)
667
+ sentence["value"] = (
668
+ sentence["value"]
669
+ .replace(DEFAULT_IMAGE_TOKEN, "")
670
+ .replace("<video>", "")
671
+ .strip()
672
+ )
673
+ sentence["value"] = DEFAULT_IMAGE_TOKEN + "\n" + sentence["value"]
674
+ sentence["value"] = sentence["value"].strip()
675
+ if "mmtag" in conversation_lib.default_conversation.version:
676
+ sentence["value"] = sentence["value"].replace(
677
+ DEFAULT_IMAGE_TOKEN,
678
+ "<Image>" + DEFAULT_IMAGE_TOKEN + "</Image>",
679
+ )
680
+ replace_token = DEFAULT_IMAGE_TOKEN
681
+ if data_args.mm_use_im_start_end:
682
+ replace_token = (
683
+ DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
684
+ )
685
+ sentence["value"] = sentence["value"].replace(
686
+ DEFAULT_IMAGE_TOKEN, replace_token
687
+ )
688
+
689
+ return sources
690
+
691
+
692
+ def preprocess_llama_2(
693
+ sources,
694
+ tokenizer: transformers.PreTrainedTokenizer,
695
+ has_image: bool = False,
696
+ ) -> dict:
697
+ conv = conversation_lib.default_conversation.copy()
698
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
699
+
700
+ # Apply prompt templates
701
+ conversations = []
702
+ for i, source in enumerate(sources):
703
+ if roles[source[0]["from"]] != conv.roles[0]:
704
+ # Skip the first one if it is not from human
705
+ source = source[1:]
706
+
707
+ conv.messages = []
708
+ for j, sentence in enumerate(source):
709
+ role = roles[sentence["from"]]
710
+ assert role == conv.roles[j % 2], f"{i}"
711
+ conv.append_message(role, sentence["value"])
712
+ conversations.append(conv.get_prompt())
713
+
714
+ # Tokenize conversations
715
+
716
+ if has_image:
717
+ input_ids = torch.stack(
718
+ [
719
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
720
+ for prompt in conversations
721
+ ],
722
+ dim=0,
723
+ )
724
+ else:
725
+ input_ids = tokenizer(
726
+ conversations,
727
+ return_tensors="pt",
728
+ padding="longest",
729
+ max_length=tokenizer.model_max_length,
730
+ truncation=True,
731
+ ).input_ids
732
+
733
+ targets = input_ids.clone()
734
+
735
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
736
+
737
+ # Mask targets
738
+ sep = "[/INST] "
739
+ for conversation, target in zip(conversations, targets):
740
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
741
+
742
+ rounds = conversation.split(conv.sep2)
743
+ cur_len = 1
744
+ target[:cur_len] = IGNORE_INDEX
745
+ for rou in rounds:
746
+ if rou == "":
747
+ break
748
+
749
+ parts = rou.split(sep)
750
+ if len(parts) != 2:
751
+ break
752
+ parts[0] += sep
753
+
754
+ if has_image:
755
+ round_len = len(tokenizer_image_token(rou, tokenizer))
756
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
757
+ else:
758
+ round_len = len(tokenizer(rou).input_ids)
759
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
760
+
761
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
762
+
763
+ cur_len += round_len
764
+ target[cur_len:] = IGNORE_INDEX
765
+
766
+ if cur_len < tokenizer.model_max_length:
767
+ if cur_len != total_len:
768
+ target[:] = IGNORE_INDEX
769
+ print(
770
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
771
+ f" (ignored)"
772
+ )
773
+
774
+ return dict(
775
+ input_ids=input_ids,
776
+ labels=targets,
777
+ )
778
+
779
+
780
+ def preprocess_v1(
781
+ sources,
782
+ tokenizer: transformers.PreTrainedTokenizer,
783
+ has_image: bool = False,
784
+ ) -> dict:
785
+ conv = conversation_lib.default_conversation.copy()
786
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
787
+
788
+ # Apply prompt templates
789
+ conversations = []
790
+ for i, source in enumerate(sources):
791
+ if roles[source[0]["from"]] != conv.roles[0]:
792
+ # Skip the first one if it is not from human
793
+ source = source[1:]
794
+
795
+ conv.messages = []
796
+ for j, sentence in enumerate(source):
797
+ role = roles[sentence["from"]]
798
+ assert role == conv.roles[j % 2], f"{i}"
799
+ conv.append_message(role, sentence["value"])
800
+ conversations.append(conv.get_prompt())
801
+
802
+ # Tokenize conversations
803
+
804
+ if has_image:
805
+ input_ids = torch.stack(
806
+ [
807
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
808
+ for prompt in conversations
809
+ ],
810
+ dim=0,
811
+ )
812
+ else:
813
+ input_ids = tokenizer(
814
+ conversations,
815
+ return_tensors="pt",
816
+ padding="longest",
817
+ max_length=tokenizer.model_max_length,
818
+ truncation=True,
819
+ ).input_ids
820
+
821
+ targets = input_ids.clone()
822
+
823
+ assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
824
+
825
+ # Mask targets
826
+ sep = conv.sep + conv.roles[1] + ": "
827
+ for conversation, target in zip(conversations, targets):
828
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
829
+
830
+ rounds = conversation.split(conv.sep2)
831
+ cur_len = 1
832
+ target[:cur_len] = IGNORE_INDEX
833
+ for i, rou in enumerate(rounds):
834
+ if rou == "":
835
+ break
836
+
837
+ parts = rou.split(sep)
838
+ if len(parts) != 2:
839
+ break
840
+ parts[0] += sep
841
+
842
+ if has_image:
843
+ round_len = len(tokenizer_image_token(rou, tokenizer))
844
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
845
+ else:
846
+ round_len = len(tokenizer(rou).input_ids)
847
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
848
+ if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14:
849
+ round_len -= 1
850
+ instruction_len -= 1
851
+
852
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
853
+
854
+ cur_len += round_len
855
+ target[cur_len:] = IGNORE_INDEX
856
+
857
+ if cur_len < tokenizer.model_max_length:
858
+ if cur_len != total_len:
859
+ target[:] = IGNORE_INDEX
860
+ print(
861
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
862
+ f" (ignored)"
863
+ )
864
+
865
+ return dict(
866
+ input_ids=input_ids,
867
+ labels=targets,
868
+ )
869
+
870
+
871
+ def tokenizer_image_token(
872
+ prompt,
873
+ tokenizer,
874
+ image_token_index=IMAGE_TOKEN_INDEX,
875
+ return_tensors=None,
876
+ ):
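+ # Splits the prompt on "<image>" and splices image_token_index between the tokenized text chunks,
+ # e.g. "A <image> B" -> ids("A ") + [image_token_index] + ids(" B"), keeping a leading BOS token if present.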
877
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
878
+
879
+ def insert_separator(X, sep):
880
+ return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
881
+
882
+ input_ids = []
883
+ offset = 0
884
+ if (
885
+ len(prompt_chunks) > 0
886
+ and len(prompt_chunks[0]) > 0
887
+ and prompt_chunks[0][0] == tokenizer.bos_token_id
888
+ ):
889
+ offset = 1
890
+ input_ids.append(prompt_chunks[0][0])
891
+
892
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
893
+ input_ids.extend(x[offset:])
894
+
895
+ if return_tensors is not None:
896
+ if return_tensors == "pt":
897
+ return torch.tensor(input_ids, dtype=torch.long)
898
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
899
+ return input_ids
900
+
901
+
902
+ def tokenizer_image_token_llama3(
903
+ prompt,
904
+ tokenizer,
905
+ image_token_index=IMAGE_TOKEN_INDEX,
906
+ return_tensors=None,
907
+ ):
908
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
909
+
910
+ def insert_separator(X, sep):
911
+ return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
912
+
913
+ input_ids = []
914
+ for x in insert_separator(prompt_chunks, [image_token_index]):
915
+ input_ids.extend(x)
916
+
917
+ if return_tensors is not None:
918
+ if return_tensors == "pt":
919
+ return torch.tensor(input_ids, dtype=torch.long)
920
+ raise ValueError(f"Unsupported tensor type: {return_tensors}")
921
+ return input_ids
922
+
923
+
924
+ def preprocess_qwen(
925
+ sources,
926
+ tokenizer: transformers.PreTrainedTokenizer,
927
+ has_image: bool = False,
928
+ system_message: str = "You are a helpful assistant.",
929
+ ) -> dict:
930
+ roles = {"human": "user", "gpt": "assistant"}
931
+
932
+ # Add the image token to the tokenizer as a special token
933
+ # Use a deep copy of the tokenizer so that we don't modify the original tokenizer
934
+ tokenizer = copy.deepcopy(tokenizer)
935
+ # When there is actually an image, we add the image tokens as a special token
936
+ if has_image:
937
+ tokenizer.add_tokens(["<image>"], special_tokens=True)
938
+
939
+ image_token_index = tokenizer.convert_tokens_to_ids("<image>")
940
+ im_start = tokenizer.convert_tokens_to_ids("<|im_start|>")
941
+ im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
942
+
943
+ unmask_tokens_idx = [198, im_start, im_end]
944
+ # nl_tokens = tokenizer("\n").input_ids
945
+
946
+ # Reset the Qwen chat template so that it doesn't prepend the system message on every apply_chat_template call
947
+ chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
948
+ tokenizer.chat_template = chat_template
949
+
950
+ # _system = tokenizer("system").input_ids + nl_tokens
951
+ # _user = tokenizer("user").input_ids + nl_tokens
952
+ # _assistant = tokenizer("assistant").input_ids + nl_tokens
953
+
954
+ # Apply prompt templates
955
+ input_ids, targets = [], []
956
+ for source in sources:
957
+ if roles[source[0]["from"]] != roles["human"]:
958
+ source = source[1:]
959
+
960
+ input_id, target = [], []
961
+
962
+ # New version, use apply chat template
963
+ # Build system message for each sentence
964
+ input_id += tokenizer.apply_chat_template(
965
+ [{"role": "system", "content": system_message}]
966
+ )
967
+ target += [IGNORE_INDEX] * len(input_id)
968
+
969
+ for conv in source:
970
+ # Make sure llava data can load
971
+ try:
972
+ role = conv["role"]
973
+ content = conv["content"]
974
+ except:
975
+ role = conv["from"]
976
+ content = conv["value"]
977
+
978
+ role = roles.get(role, role)
979
+
980
+ conv = [{"role": role, "content": content}]
981
+ encode_id = tokenizer.apply_chat_template(conv)
982
+ input_id += encode_id
983
+ if role in ["user", "system"]:
984
+ target += [IGNORE_INDEX] * len(encode_id)
985
+ else:
986
+ target += encode_id
987
+
988
+ assert len(input_id) == len(target), f"{len(input_id)} != {len(target)}"
989
+ for idx, encode_id in enumerate(input_id):
990
+ if encode_id in unmask_tokens_idx:
991
+ target[idx] = encode_id
992
+ if encode_id == image_token_index:
993
+ input_id[idx] = IMAGE_TOKEN_INDEX
994
+ input_ids.append(input_id)
995
+ targets.append(target)
996
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
997
+ targets = torch.tensor(targets, dtype=torch.long)
998
+
999
+ return dict(
1000
+ input_ids=input_ids, # tensor(bs x seq_len)
1001
+ labels=targets, # tensor(bs x seq_len)
1002
+ )
1003
+
1004
+
1005
+ def preprocess_llama3(
1006
+ sources,
1007
+ tokenizer: transformers.PreTrainedTokenizer,
1008
+ has_image: bool = False,
1009
+ system_message: str = "You are a helpful assistant.",
1010
+ ) -> dict:
1011
+ # roles = {"human": "<|start_header_id|>user<|end_header_id|>", "gpt": "<|start_header_id|>assistant<|end_header_id|>"}
1012
+ roles = {"human": "user", "gpt": "assistant"}
1013
+
1014
+ # Add the image token to the tokenizer as a special token
1015
+ # Use a deep copy of the tokenizer so that we don't modify the original tokenizer
1016
+ tokenizer = copy.deepcopy(tokenizer)
1017
+ # When there is actually an image, we add the image tokens as a special token
1018
+ if has_image:
1019
+ tokenizer.add_tokens(["<image>"], special_tokens=True)
1020
+ image_token_index = tokenizer.convert_tokens_to_ids("<image>")
1021
+ bos_token_id = tokenizer.convert_tokens_to_ids("<|begin_of_text|>")
1022
+ start_header_id = tokenizer.convert_tokens_to_ids("<|start_header_id|>")
1023
+ end_header_id = tokenizer.convert_tokens_to_ids("<|end_header_id|>")
1024
+ eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
1025
+
1026
+ unmask_tokens = [
1027
+ "<|begin_of_text|>",
1028
+ "<|start_header_id|>",
1029
+ "<|end_header_id|>",
1030
+ "<|eot_id|>",
1031
+ "\n\n",
1032
+ ]
1033
+ unmask_tokens_idx = [tokenizer.convert_tokens_to_ids(tok) for tok in unmask_tokens]
1034
+
1035
+ # After the tokenizers update, calling the Llama 3 tokenizer
1036
+ # automatically prepends the BOS id, so it is stripped here.
1037
+ def safe_tokenizer_llama3(text):
1038
+ input_ids = tokenizer(text).input_ids
1039
+ if input_ids[0] == bos_token_id:
1040
+ input_ids = input_ids[1:]
1041
+ return input_ids
1042
+
1043
+ nl_tokens = tokenizer.convert_tokens_to_ids("\n\n")
1044
+
1045
+ # chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{%- if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}"
1046
+ chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
1047
+ tokenizer.chat_template = chat_template
1048
+
1049
+ # Apply prompt templates
1050
+ input_ids, targets = [], []
1051
+ for source in sources:
1052
+ if roles[source[0]["from"]] != roles["human"]:
1053
+ source = source[1:]
1054
+
1055
+ input_id, target = [], []
1056
+
1057
+ # New version, use apply chat template
1058
+ # Build system message for each sentence
1059
+ input_id += tokenizer.apply_chat_template(
1060
+ [{"role": "system", "content": system_message}]
1061
+ # pyre-fixme[6]: For 1st argument expected `Union[int, str]` but got `slice`.
1062
+ )[:-4]
1063
+
1064
+ target += [IGNORE_INDEX] * len(input_id)
1065
+
1066
+ for conv in source:
1067
+ # Make sure llava data can load
1068
+ try:
1069
+ role = conv["role"]
1070
+ content = conv["content"]
1071
+ except:
1072
+ role = conv["from"]
1073
+ content = conv["value"]
1074
+
1075
+ role = roles.get(role, role)
1076
+
1077
+ conv = [{"role": role, "content": content}]
1078
+ # First is bos token we don't need here
1079
+ encode_id = tokenizer.apply_chat_template(conv)[1:-4]
1080
+ input_id += encode_id
1081
+ if role in ["user", "system"]:
1082
+ target += [IGNORE_INDEX] * len(encode_id)
1083
+ else:
1084
+ target += encode_id
1085
+
1086
+ assert len(input_id) == len(target), f"{len(input_id)} != {len(target)}"
1087
+ for idx, encode_id in enumerate(input_id):
1088
+ if encode_id in unmask_tokens_idx:
1089
+ target[idx] = encode_id
1090
+ if encode_id == image_token_index:
1091
+ input_id[idx] = IMAGE_TOKEN_INDEX
1092
+ input_ids.append(input_id)
1093
+ targets.append(target)
1094
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
1095
+ targets = torch.tensor(targets, dtype=torch.long)
1096
+
1097
+ # print("input_ids", input_ids, flush=True)
1098
+ # print("targets", targets, flush=True)
1099
+ return dict(
1100
+ input_ids=input_ids, # tensor(bs x seq_len)
1101
+ labels=targets, # tensor(bs x seq_len)
1102
+ )
1103
+
1104
+
1105
+ def preprocess_llama_3_1(
1106
+ sources,
1107
+ tokenizer: transformers.PreTrainedTokenizer,
1108
+ has_image: bool = False,
1109
+ ) -> dict:
1110
+ conv = conversation_lib.default_conversation.copy()
1111
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
1112
+
1113
+ # Apply prompt templates
1114
+ conversations = []
1115
+ for source in sources:
1116
+ if roles[source[0]["from"]] != conv.roles[0]:
1117
+ # Skip the first one if it is not from human
1118
+ source = source[1:]
1119
+
1120
+ conv.messages = []
1121
+ for sentence in source:
1122
+ if sentence["from"] == "Answer":
1123
+ sentence["from"] = "gpt" # data bug
1124
+ role = roles[sentence["from"]]
1125
+ # assert role == conv.roles[j % 2], f"{i}"
1126
+ conv.append_message(role, sentence["value"])
1127
+ conversations.append(conv.get_prompt())
1128
+
1129
+ # Tokenize conversations
1130
+
1131
+ if has_image:
1132
+ input_ids = torch.stack(
1133
+ [
1134
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1135
+ for prompt in conversations
1136
+ ],
1137
+ dim=0,
1138
+ )
1139
+ else:
1140
+ input_ids = tokenizer(
1141
+ conversations,
1142
+ return_tensors="pt",
1143
+ padding="longest",
1144
+ max_length=tokenizer.model_max_length,
1145
+ truncation=True,
1146
+ ).input_ids
1147
+
1148
+ # remove the first bos token
1149
+ if input_ids[0][0] == input_ids[0][1] == tokenizer.bos_token_id:
1150
+ input_ids = input_ids[:, 1:]
1151
+ targets = input_ids.clone()
1152
+
1153
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_3_1
1154
+
1155
+ # Mask targets
1156
+ sep = "<|start_header_id|>" + conv.roles[1] + "<|end_header_id|>" + "\n\n"
1157
+ # sep = conv.sep + conv.roles[1] + ": "
1158
+ for conversation, target in zip(conversations, targets):
1159
+ total_len = int(target.shape[0])
1160
+
1161
+ rounds = conversation.split(conv.tokenizer.eos_token)
1162
+ rounds = [rounds[0]] + [
1163
+ rounds[idx] + rounds[idx + 1] for idx in range(1, len(rounds) - 1, 2)
1164
+ ]
1165
+
1166
+ cur_len = 1
1167
+ target[:cur_len] = IGNORE_INDEX
1168
+ for i, rou in enumerate(rounds):
1169
+ if rou == "":
1170
+ break
1171
+
1172
+ parts = rou.split(sep)
1173
+ if len(parts) != 2 and i != 0:
1174
+ break
1175
+
1176
+ if i == 0:
1177
+ round_len = len(tokenizer(rou, add_special_tokens=False).input_ids)
1178
+ instruction_len = len(
1179
+ tokenizer(rou, add_special_tokens=False).input_ids
1180
+ )
1181
+
1182
+ else:
1183
+ parts[0] += sep
1184
+ if has_image:
1185
+ round_len = len(tokenizer_image_token(rou, tokenizer)) + 1
1186
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer))
1187
+ else:
1188
+ round_len = len(tokenizer(rou).input_ids) + 1
1189
+ instruction_len = len(tokenizer(parts[0]).input_ids)
1190
+
1191
+ # if i > 0: round_len += 1
1192
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
1193
+ cur_len += round_len
1194
+
1195
+ target[cur_len:] = IGNORE_INDEX
1196
+ cur_len = cur_len + len(tokenizer(sep, add_special_tokens=False).input_ids)
1197
+
1198
+ # if cur_len > tokenizer.model_max_length: print(f"WARNING: max length context")
1199
+ if cur_len < tokenizer.model_max_length:
1200
+ if cur_len != total_len:
1201
+ target[:] = IGNORE_INDEX
1202
+ print(
1203
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
1204
+ f" (ignored)"
1205
+ )
1206
+
1207
+ return dict(
1208
+ input_ids=input_ids,
1209
+ labels=targets,
1210
+ )
1211
+
1212
+
1213
+ def preprocess_llama_3_2(
1214
+ sources,
1215
+ tokenizer: transformers.PreTrainedTokenizer,
1216
+ has_image: bool = False,
1217
+ ) -> dict:
1218
+ conv = conversation_lib.default_conversation.copy()
1219
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
1220
+
1221
+ # Apply prompt templates
1222
+ conversations = []
1223
+ for i, source in enumerate(sources):
1224
+ if roles[source[0]["from"]] != conv.roles[0]:
1225
+ # Skip the first one if it is not from human
1226
+ source = source[1:]
1227
+
1228
+ conv.messages = []
1229
+ for j, sentence in enumerate(source):
1230
+ role = roles[sentence["from"]]
1231
+ assert role == conv.roles[j % 2], f"{i}"
1232
+ conv.append_message(role, sentence["value"])
1233
+ conversations.append(conv.get_prompt())
1234
+
1235
+ # Tokenize conversations
1236
+
1237
+ if has_image:
1238
+ input_ids = torch.stack(
1239
+ [
1240
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1241
+ for prompt in conversations
1242
+ ],
1243
+ dim=0,
1244
+ )
1245
+ else:
1246
+ input_ids = tokenizer(
1247
+ conversations,
1248
+ return_tensors="pt",
1249
+ padding="longest",
1250
+ max_length=tokenizer.model_max_length,
1251
+ truncation=True,
1252
+ ).input_ids
1253
+
1254
+ # remove the first bos token
1255
+ if input_ids[0][0] == input_ids[0][1] == tokenizer.bos_token_id:
1256
+ input_ids = input_ids[:, 1:]
1257
+ targets = input_ids.clone()
1258
+
1259
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_3_2
1260
+
1261
+ # Mask targets
1262
+ sep = "<|start_header_id|>" + conv.roles[1] + "<|end_header_id|>" + "\n\n"
1263
+ # sep = conv.sep + conv.roles[1] + ": "
1264
+ for conversation, target in zip(conversations, targets):
1265
+ total_len = int(target.shape[0])
1266
+
1267
+ rounds = conversation.split(conv.tokenizer.eos_token)
1268
+ rounds = [rounds[0]] + [
1269
+ rounds[idx] + rounds[idx + 1] for idx in range(1, len(rounds) - 1, 2)
1270
+ ]
1271
+
1272
+ cur_len = 1
1273
+ target[:cur_len] = IGNORE_INDEX
1274
+ for i, rou in enumerate(rounds):
1275
+ if rou == "":
1276
+ break
1277
+
1278
+ parts = rou.split(sep)
1279
+ if len(parts) != 2 and i != 0:
1280
+ break
1281
+
1282
+ if i == 0:
1283
+ round_len = len(tokenizer(rou, add_special_tokens=False).input_ids)
1284
+ instruction_len = len(
1285
+ tokenizer(rou, add_special_tokens=False).input_ids
1286
+ )
1287
+
1288
+ else:
1289
+ parts[0] += sep
1290
+ if has_image:
1291
+ round_len = len(tokenizer_image_token(rou, tokenizer)) + 1
1292
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer))
1293
+ else:
1294
+ round_len = len(tokenizer(rou).input_ids) + 1
1295
+ instruction_len = len(tokenizer(parts[0]).input_ids)
1296
+
1297
+ # if i > 0: round_len += 1
1298
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
1299
+ cur_len += round_len
1300
+
1301
+ target[cur_len:] = IGNORE_INDEX
1302
+ cur_len = cur_len + len(tokenizer(sep, add_special_tokens=False).input_ids)
1303
+
1304
+ # if cur_len > tokenizer.model_max_length: print(f"WARNING: max length context")
1305
+ if cur_len < tokenizer.model_max_length:
1306
+ if cur_len != total_len:
1307
+ target[:] = IGNORE_INDEX
1308
+ print(
1309
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
1310
+ f" (ignored)"
1311
+ )
1312
+
1313
+ return dict(
1314
+ input_ids=input_ids,
1315
+ labels=targets,
1316
+ )
1317
+
1318
+
1319
+ def preprocess_phi3(
1320
+ sources,
1321
+ tokenizer: transformers.PreTrainedTokenizer,
1322
+ has_image: bool = False,
1323
+ ) -> dict:
1324
+ conv = conversation_lib.conv_templates["phi3"].copy()
1325
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
1326
+
1327
+ # Apply prompt templates
1328
+ conversations = []
1329
+ for i, source in enumerate(sources):
1330
+ if roles[source[0]["from"]] != conv.roles[0]:
1331
+ # Skip the first one if it is not from human
1332
+ source = source[1:]
1333
+
1334
+ conv.messages = []
1335
+ for j, sentence in enumerate(source):
1336
+ role = roles[sentence["from"]]
1337
+ assert role == conv.roles[j % 2], f"{i}"
1338
+ conv.append_message(role, sentence["value"])
1339
+ conversations.append(conv.get_prompt())
1340
+
1341
+ # Tokenize conversations
1342
+ if has_image:
1343
+ input_ids = torch.stack(
1344
+ [
1345
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1346
+ for prompt in conversations
1347
+ ],
1348
+ dim=0,
1349
+ )
1350
+ else:
1351
+ input_ids = tokenizer(
1352
+ conversations,
1353
+ return_tensors="pt",
1354
+ padding="longest",
1355
+ max_length=tokenizer.model_max_length,
1356
+ truncation=True,
1357
+ ).input_ids
1358
+
1359
+ targets = input_ids.clone()
1360
+ assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
1361
+
1362
+ # Mask targets
1363
+ sep = conv.sep + conv.roles[1]
1364
+ for conversation, target in zip(conversations, targets):
1365
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
1366
+
1367
+ rounds = conversation.split(conv.sep)
1368
+ re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
1369
+ for conv_idx in range(3, len(rounds), 2):
1370
+ re_rounds.append(
1371
+ conv.sep.join(rounds[conv_idx : conv_idx + 2])
1372
+ ) # user + gpt
1373
+ cur_len = 0
1374
+ target[:cur_len] = IGNORE_INDEX
1375
+ for i, rou in enumerate(re_rounds):
1376
+ if rou == "":
1377
+ break
1378
+
1379
+ parts = rou.split(sep)
1380
+ if len(parts) != 2:
1381
+ break
1382
+ parts[0] += sep
1383
+
1384
+ if has_image:
1385
+ round_len = len(tokenizer_image_token(rou, tokenizer))
1386
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1
1387
+ else:
1388
+ round_len = len(tokenizer(rou).input_ids)
1389
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 1
1390
+
1391
+ if i == 0:
1392
+ round_len += 1
1393
+ instruction_len += 1
1394
+ else:
1395
+ round_len -= 2
1396
+ instruction_len -= 2
1397
+
1398
+ if (
1399
+ i != 0
1400
+ and getattr(tokenizer, "legacy", False)
1401
+ and IS_TOKENIZER_GREATER_THAN_0_14
1402
+ ):
1403
+ round_len += 1
1404
+ instruction_len += 1
1405
+
1406
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
1407
+
1408
+ cur_len += round_len
1409
+ target[cur_len:] = IGNORE_INDEX
1410
+
1411
+ if cur_len < tokenizer.model_max_length:
1412
+ if cur_len != total_len:
1413
+ target[:] = IGNORE_INDEX
1414
+ print(
1415
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
1416
+ f" (ignored)"
1417
+ )
1418
+
1419
+ return dict(
1420
+ input_ids=input_ids,
1421
+ labels=targets,
1422
+ )
1423
+
1424
+
1425
+ def preprocess_mpt(
1426
+ sources,
1427
+ tokenizer: transformers.PreTrainedTokenizer,
1428
+ has_image: bool = False,
1429
+ ) -> dict:
1430
+ conv = conversation_lib.default_conversation.copy()
1431
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
1432
+
1433
+ # Apply prompt templates
1434
+ conversations = []
1435
+ for i, source in enumerate(sources):
1436
+ if roles[source[0]["from"]] != conv.roles[0]:
1437
+ # Skip the first one if it is not from human
1438
+ source = source[1:]
1439
+
1440
+ conv.messages = []
1441
+ for j, sentence in enumerate(source):
1442
+ role = roles[sentence["from"]]
1443
+ assert role == conv.roles[j % 2], f"{i}"
1444
+ conv.append_message(role, sentence["value"])
1445
+ conversations.append(conv.get_prompt())
1446
+
1447
+ # Tokenize conversations
1448
+ if has_image:
1449
+ input_ids = torch.stack(
1450
+ [
1451
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1452
+ for prompt in conversations
1453
+ ],
1454
+ dim=0,
1455
+ )
1456
+ else:
1457
+ input_ids = tokenizer(
1458
+ conversations,
1459
+ return_tensors="pt",
1460
+ padding="longest",
1461
+ max_length=tokenizer.model_max_length,
1462
+ truncation=True,
1463
+ ).input_ids
1464
+
1465
+ targets = input_ids.clone()
1466
+ assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
1467
+
1468
+ # Mask targets
1469
+ sep = conv.sep + conv.roles[1]
1470
+ for conversation, target in zip(conversations, targets):
1471
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
1472
+
1473
+ rounds = conversation.split(conv.sep)
1474
+ re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
1475
+ for conv_idx in range(3, len(rounds), 2):
1476
+ re_rounds.append(
1477
+ conv.sep.join(rounds[conv_idx : conv_idx + 2])
1478
+ ) # user + gpt
1479
+ cur_len = 0
1480
+ target[:cur_len] = IGNORE_INDEX
1481
+ for i, rou in enumerate(re_rounds):
1482
+ if rou == "":
1483
+ break
1484
+
1485
+ parts = rou.split(sep)
1486
+ if len(parts) != 2:
1487
+ break
1488
+ parts[0] += sep
1489
+ if has_image:
1490
+ round_len = len(tokenizer_image_token(rou, tokenizer))
1491
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1
1492
+ else:
1493
+ round_len = len(tokenizer(rou).input_ids)
1494
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 1
1495
+
1496
+ if (
1497
+ i != 0
1498
+ and getattr(tokenizer, "legacy", False)
1499
+ and IS_TOKENIZER_GREATER_THAN_0_14
1500
+ ):
1501
+ round_len += 1
1502
+ instruction_len += 1
1503
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
1504
+
1505
+ cur_len += round_len
1506
+ target[cur_len:] = IGNORE_INDEX
1507
+
1508
+ if cur_len < tokenizer.model_max_length:
1509
+ if cur_len != total_len:
1510
+ target[:] = IGNORE_INDEX
1511
+ print(
1512
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
1513
+ f" (ignored)"
1514
+ )
1515
+
1516
+ return dict(
1517
+ input_ids=input_ids,
1518
+ labels=targets,
1519
+ )
1520
+
1521
+
1522
+ def preprocess_plain(
1523
+ sources: Sequence[str],
1524
+ tokenizer: transformers.PreTrainedTokenizer,
1525
+ ) -> dict:
1526
+ # add end signal and concatenate together
1527
+ conversations = []
1528
+ for source in sources:
1529
+ assert len(source) == 2
1530
+ assert DEFAULT_IMAGE_TOKEN in source[0]["value"]
1531
+ source[0]["value"] = DEFAULT_IMAGE_TOKEN
1532
+ conversation = (
1533
+ source[0]["value"]
1534
+ + source[1]["value"]
1535
+ + conversation_lib.default_conversation.sep
1536
+ )
1537
+ conversations.append(conversation)
1538
+ # tokenize conversations
1539
+ input_ids = [
1540
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1541
+ for prompt in conversations
1542
+ ]
1543
+ targets = copy.deepcopy(input_ids)
1544
+ for target, source in zip(targets, sources):
1545
+ tokenized_len = len(tokenizer_image_token(source[0]["value"], tokenizer))
1546
+ target[:tokenized_len] = IGNORE_INDEX
1547
+
1548
+ return dict(input_ids=input_ids, labels=targets)
1549
+
1550
+
1551
+ def preprocess(
1552
+ sources: Sequence[str],
1553
+ tokenizer: transformers.PreTrainedTokenizer,
1554
+ has_image: bool = False,
1555
+ ) -> dict:
1556
+ """
1557
+ Given a list of sources, each of which is a conversation list, this transform:
1558
+ 1. Add signal '### ' at the beginning of each sentence, with end signal '\n';
1559
+ 2. Concatenate conversations together;
1560
+ 3. Tokenize the concatenated conversation;
1561
+ 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
1562
+ """
1563
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
1564
+ return preprocess_plain(sources, tokenizer)
1565
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
1566
+ return preprocess_llama_2(sources, tokenizer, has_image=has_image)
1567
+ if conversation_lib.default_conversation.version.startswith("v1"):
1568
+ return preprocess_v1(sources, tokenizer, has_image=has_image)
1569
+ if conversation_lib.default_conversation.version == "mpt":
1570
+ return preprocess_mpt(sources, tokenizer, has_image=has_image)
1571
+ if conversation_lib.default_conversation.version == "phi3":
1572
+ return preprocess_phi3(sources, tokenizer, has_image=has_image)
1573
+ if conversation_lib.default_conversation.version == "qwen":
1574
+ return preprocess_qwen(sources, tokenizer, has_image=has_image)
1575
+ # add end signal and concatenate together
1576
+ conversations = []
1577
+ for source in sources:
1578
+ header = f"{conversation_lib.default_conversation.system}\n\n"
1579
+ conversation = _add_speaker_and_signal(header, source)
1580
+ conversations.append(conversation)
1581
+
1582
+ # tokenize conversations
1583
+ def get_tokenize_len(prompts):
1584
+ return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts]
1585
+
1586
+ if has_image:
1587
+ input_ids = [
1588
+ tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
1589
+ for prompt in conversations
1590
+ ]
1591
+ else:
1592
+ conversations_tokenized = _tokenize_fn(conversations, tokenizer)
1593
+ input_ids = conversations_tokenized["input_ids"]
1594
+
1595
+ targets = copy.deepcopy(input_ids)
1596
+ for target, source in zip(targets, sources):
1597
+ if has_image:
1598
+ tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source])
1599
+ else:
1600
+ tokenized_lens = _tokenize_fn(
1601
+ [header] + [s["value"] for s in source],
1602
+ tokenizer,
1603
+ )["input_ids_lens"]
1604
+ speakers = [sentence["from"] for sentence in source]
1605
+ _mask_targets(target, tokenized_lens, speakers)
1606
+
1607
+ return dict(input_ids=input_ids, labels=targets)
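A minimal sketch of the label masking that preprocess() performs (illustrative only, not part of this commit; mask_human_turns and the toy lengths are hypothetical): the system header and human turns are set to IGNORE_INDEX so that only assistant replies contribute to the loss.

import torch

IGNORE_INDEX = -100

def mask_human_turns(input_ids, tokenized_lens, speakers):
    # input_ids: 1-D tensor of the concatenated, tokenized conversation
    labels = input_ids.clone()
    cur = tokenized_lens[0]              # mask the system header
    labels[:cur] = IGNORE_INDEX
    for length, speaker in zip(tokenized_lens[1:], speakers):
        if speaker == "human":
            labels[cur : cur + length] = IGNORE_INDEX
        cur += length
    return labels

ids = torch.arange(20)
labels = mask_human_turns(ids, tokenized_lens=[4, 6, 10], speakers=["human", "gpt"])
# positions 0-9 become IGNORE_INDEX, positions 10-19 keep their token ids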
tempo/multimodal_encoder/__pycache__/base_encoder.cpython-312.pyc ADDED
Binary file (6.52 kB). View file
 
tempo/multimodal_encoder/__pycache__/builder.cpython-312.pyc ADDED
Binary file (1.28 kB). View file
 
tempo/multimodal_encoder/__pycache__/qwen3vl_encoder.cpython-312.pyc ADDED
Binary file (14.4 kB). View file
 
tempo/multimodal_encoder/__pycache__/siglip_encoder.cpython-312.pyc ADDED
Binary file (4.29 kB). View file
 
tempo/multimodal_encoder/base_encoder.py ADDED
@@ -0,0 +1,135 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class ProcessorWrapper:
8
+ def __init__(
9
+ self,
10
+ transform,
11
+ height=378,
12
+ width=378,
13
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
14
+ ):
15
+ self._crop_size = {
16
+ "height": height,
17
+ "width": width,
18
+ }
19
+ self._transforms = transform
20
+ # print(transform)
21
+ self.image_mean = image_mean
22
+
23
+ @property
24
+ def crop_size(self):
25
+ return self._crop_size
26
+
27
+ def preprocess(self, image, return_tensors="pt"):
28
+ # apply the transform directly; the input is expected to be a PIL Image
29
+ output = {}
30
+ output["pixel_values"] = [self._transforms(image)]
31
+ return output
32
+
33
+
34
+ class BaseVisionTower(nn.Module):
35
+ def __init__(self, vision_tower_name, args, delay_load=False):
36
+ super().__init__()
37
+
38
+ self.is_loaded = False
39
+ self.args = args
40
+
41
+ self.vision_tower_name = vision_tower_name
42
+ self.select_layer = args.mm_vision_select_layer
43
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
44
+ self.unfreeze_mm_vision_tower = getattr(args, "unfreeze_mm_vision_tower", False)
45
+ self.delay_load = delay_load
46
+
47
+ @abstractmethod
48
+ def load_model(self, device_map=None):
49
+ raise NotImplementedError("Subclasses must implement load_model")
50
+
51
+ @abstractmethod
52
+ def _forward(self, images):
53
+ raise NotImplementedError("Subclasses must implement forward")
54
+
55
+ def forward(self, images):
56
+ if type(images) is list:
57
+ image_features = [self._forward(image.unsqueeze(0)) for image in images]
58
+ else:
59
+ image_features = self._forward(images)
60
+
61
+ return image_features
62
+
63
+ @property
64
+ def dummy_feature(self):
65
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
66
+
67
+ @property
68
+ def dtype(self):
69
+ # Dynamically infer the dtype from the first parameter, if not explicitly specified
70
+ if hasattr(self.vision_tower, "dtype"):
71
+ return self.vision_tower.dtype
72
+ else:
73
+ params = list(self.vision_tower.parameters())
74
+ return (
75
+ params[0].dtype if len(params) > 0 else torch.float32
76
+ ) # Default to torch.float32 if no parameters
77
+
78
+ @property
79
+ def device(self):
80
+ # Dynamically infer the device from the first parameter, if not explicitly specified
81
+ if hasattr(self.vision_tower, "device"):
82
+ return self.vision_tower.device
83
+ else:
84
+ params = list(self.vision_tower.parameters())
85
+ return (
86
+ params[0].device if len(params) > 0 else torch.device("cpu")
87
+ ) # Default to CPU if no parameters
88
+
89
+ @property
90
+ def config(self):
91
+ if self.is_loaded:
92
+ return self.vision_tower.config
93
+ else:
94
+ return self.cfg_only
95
+
96
+ @property
97
+ def hidden_size(self):
98
+ try:
99
+ return self.config.hidden_size
100
+ except Exception:
101
+ return self._hidden_size
102
+
103
+ @property
104
+ def image_size(self): # resolution
105
+ # return self.config.image_size
106
+ try:
107
+ return self.config.image_size
108
+ except Exception:
109
+ return self._image_size
110
+
111
+ @property
112
+ def patch_size(self):
113
+ # return self.config.patch_size
114
+ try:
115
+ return self.config.patch_size
116
+ except Exception:
117
+ return self._patch_size
118
+
119
+ @property
120
+ def num_patches_per_side(self):
121
+ if self._interp_size is not None:
122
+ return int(self._interp_size**0.5)
123
+ try:
124
+ return self.image_size // self.patch_size
125
+ except Exception:
126
+ return self._num_patches_per_side
127
+
128
+ @property
129
+ def num_patches(self):
130
+ if self._interp_size is not None:
131
+ return self._interp_size
132
+ try:
133
+ return self.num_patches_per_side**2
134
+ except Exception:
135
+ return self._num_patches
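A minimal subclass sketch (hypothetical, not shipped in this commit): BaseVisionTower only requires load_model and _forward; forward, dtype, device and the patch-count properties are inherited. The toy Conv2d stands in for a real backbone, and instantiating the class still needs the usual args namespace with mm_vision_select_layer etc.

import torch.nn as nn
from tempo.multimodal_encoder.base_encoder import BaseVisionTower

class ToyVisionTower(BaseVisionTower):
    def load_model(self, device_map=None):
        self._hidden_size = 64
        self._interp_size = None
        # a 16x16 patchifier standing in for a real vision backbone
        self.vision_tower = nn.Conv2d(3, self._hidden_size, kernel_size=16, stride=16)
        self.is_loaded = True

    def _forward(self, images):
        feats = self.vision_tower(images)        # (B, C, H/16, W/16)
        return feats.flatten(2).transpose(1, 2)  # (B, num_patches, C)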
tempo/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,21 @@
1
+ import copy
2
+ from pathlib import Path
3
+ from .qwen3vl_encoder import Qwen3VLTower
4
+ from .siglip_encoder import SiglipVisionTower
5
+
6
+ def build_vision_tower_aux_list(vision_tower_cfg, **kwargs):
7
+
8
+ vision_tower_aux_name_list = getattr(vision_tower_cfg, "mm_vision_tower_aux_list", ["Qwen/Qwen3-VL-2B-Instruct"])
9
+
10
+ vision_tower_aux_list = []
11
+ for vision_tower_aux_name in vision_tower_aux_name_list:
12
+ config = copy.deepcopy(vision_tower_cfg)
13
+ vision_tower_basename = Path(vision_tower_aux_name).name.lower()
14
+ if "siglip" in vision_tower_basename:
15
+ vision_tower_aux_list.append(SiglipVisionTower(vision_tower_aux_name, args=config, **kwargs))
16
+ elif "qwen3-vl" in vision_tower_basename:
17
+ vision_tower_aux_list.append(Qwen3VLTower(vision_tower_aux_name, args=config, **kwargs))
18
+ else:
19
+ raise ValueError(f"Unknown vision tower: {vision_tower_basename}")
20
+
21
+ return vision_tower_aux_list
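A usage sketch with a hypothetical config namespace (constructing the Qwen3-VL tower downloads its processor and config from the Hub): the builder dispatches on the checkpoint basename, so a name containing "qwen3-vl" yields a Qwen3VLTower and a "siglip" checkpoint yields a SiglipVisionTower.

from types import SimpleNamespace
from tempo.multimodal_encoder.builder import build_vision_tower_aux_list

cfg = SimpleNamespace(
    mm_vision_tower_aux_list=["Qwen/Qwen3-VL-2B-Instruct"],
    num_compression_tokens=64,   # assumed value; required by Qwen3VLTower
    dynamic_compress=False,
)
towers = build_vision_tower_aux_list(cfg)
print(type(towers[0]).__name__)  # Qwen3VLTower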
tempo/multimodal_encoder/qwen3vl_encoder.py ADDED
@@ -0,0 +1,336 @@
1
+ import gc
2
+ import random
3
+ random.seed(42)
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from accelerate import init_empty_weights
8
+ from transformers.utils import is_torchdynamo_compiling
9
+ from transformers import AutoConfig, Qwen3VLForConditionalGeneration, Qwen3VLProcessor
10
+
11
+ class Qwen3VLTower(nn.Module):
12
+ def __init__(self, vision_tower_aux_name, args, **kwargs):
13
+ super(Qwen3VLTower, self).__init__()
14
+
15
+ self.is_loaded = True # for compatibility
16
+ self.model_path = vision_tower_aux_name
17
+ self.dynamic_compress = getattr(args, "dynamic_compress", False)
18
+
19
+ # load processor
20
+ self.image_processor = Qwen3VLProcessor.from_pretrained(self.model_path)
21
+
22
+ # load config
23
+ self.config = AutoConfig.from_pretrained(self.model_path)
24
+ self.config._attn_implementation = "flash_attention_2"
25
+ self.config.dtype = torch.bfloat16
26
+
27
+ # load model
28
+ with init_empty_weights():
29
+ self.vlm = Qwen3VLForConditionalGeneration(self.config)
30
+ if hasattr(self.vlm, "lm_head"):
31
+ del self.vlm.lm_head
32
+
33
+ self.vlm.requires_grad_(False)
34
+
35
+ self.hidden_size = self.config.text_config.hidden_size
36
+ self.num_compression_tokens = args.num_compression_tokens
37
+
38
+ self.compression_tokens = nn.Parameter(
39
+ torch.empty(1, self.num_compression_tokens, self.hidden_size)
40
+ )
41
+
42
+ def smart_init_vision_tower(self):
43
+ """Load only during Stage 0"""
44
+
45
+ temp_model = Qwen3VLForConditionalGeneration.from_pretrained(
46
+ self.model_path,
47
+ dtype=self.vlm.dtype,
48
+ device_map="cpu", # avoid multi-node / GPU conflicts
49
+ )
50
+
51
+ missing_keys, unexpected_keys = self.vlm.load_state_dict(temp_model.state_dict(), strict=False)
52
+
53
+ if len(missing_keys) > 0:
54
+ print(f"[Warning] Missing keys in Qwen3-VL loading: {missing_keys}")
55
+ if len(unexpected_keys) > 0:
56
+ print(f"[Warning] Unexpected keys in Qwen3-VL loading: {unexpected_keys}")
57
+
58
+ del temp_model
59
+ gc.collect()
60
+ torch.cuda.empty_cache()
61
+
62
+ self.vlm.requires_grad_(False)
63
+
64
+ with torch.no_grad():
65
+ embed_weights = self.vlm.model.language_model.embed_tokens.weight
66
+ mean = embed_weights.mean(dim=0)
67
+ std = embed_weights.std(dim=0)
68
+
69
+ self.compression_tokens.data = torch.normal(mean=mean.repeat(self.num_compression_tokens, 1), std=std.repeat(self.num_compression_tokens, 1)).unsqueeze(0)
70
+ print(f"[Smart Init] Done. Shape: {self.compression_tokens.shape}")
71
+
72
+ def smart_init_dynamic_compress(self):
73
+ if getattr(self, "vlm_head", None) is None:
74
+ temp_model = Qwen3VLForConditionalGeneration.from_pretrained(
75
+ self.model_path,
76
+ dtype=self.vlm.dtype,
77
+ device_map="cpu", # avoid multi-node / GPU conflicts
78
+ )
79
+ self.vlm_head = temp_model.lm_head
80
+
81
+ del temp_model
82
+ gc.collect()
83
+ torch.cuda.empty_cache()
84
+
85
+ token_true_id = self.image_processor.tokenizer.get_vocab()["Yes"]
86
+ token_false_id = self.image_processor.tokenizer.get_vocab()["No"]
87
+
88
+ lm_head_weights = self.vlm_head.weight.data
89
+ weight_yes = lm_head_weights[token_true_id]
90
+ weight_no = lm_head_weights[token_false_id]
91
+
92
+ D = weight_yes.size()[0]
93
+ self.linear_layer = nn.Linear(D, 1, bias=False)
94
+ with torch.no_grad():
95
+ self.linear_layer.weight[0] = weight_yes - weight_no
96
+
97
+ del self.vlm_head
98
+ self.linear_layer.to("cuda")
99
+
100
+ print(f"[Smart Init Router] Done!")
101
+
102
+ def compute_relevance(self, mask_compression_token, batch_size, last_hidden_state):
103
+ first_compression_idx = mask_compression_token.int().argmax(
104
+ dim=1
105
+ ) # (batch_size,)
106
+ prev_idx = first_compression_idx - 1 # (batch_size,)
107
+ batch_indices = torch.arange(batch_size, device=last_hidden_state.device)
108
+ prev_token_features = last_hidden_state[
109
+ batch_indices, prev_idx
110
+ ] # (batch_size, hidden_dim)
111
+
112
+ scores = self.linear_layer(prev_token_features.float())
113
+ scores = torch.sigmoid(scores).squeeze(-1).cpu().detach().tolist()
114
+
115
+ return scores
116
+
117
+ def load_model(self):
118
+ # kept for compatibility with other encoders
119
+ pass
120
+
121
+ def forward(
122
+ self,
123
+ input_ids=None,
124
+ attention_mask=None,
125
+ position_ids=None,
126
+ past_key_values=None,
127
+ inputs_embeds=None,
128
+ pixel_values=None,
129
+ pixel_values_videos=None,
130
+ image_grid_thw=None,
131
+ video_grid_thw=None,
132
+ cache_position=None,
133
+ **kwargs,
134
+ ):
135
+ if self.dynamic_compress and not hasattr(self, "linear_layer"):
136
+ self.smart_init_dynamic_compress()
137
+
138
+ if (input_ids is None) ^ (inputs_embeds is not None):
139
+ raise ValueError(
140
+ "You must specify exactly one of input_ids or inputs_embeds"
141
+ )
142
+
143
+ used_compression_tokens = self.num_compression_tokens
144
+
145
+ if inputs_embeds is None:
146
+ # process input_ids to insert learnable token
147
+ batch_size, n_seq = input_ids.shape
148
+ valid_lengths = (
149
+ input_ids != self.image_processor.tokenizer.pad_token_id
150
+ ).sum(dim=1)
151
+ input_ids = torch.cat(
152
+ [
153
+ input_ids,
154
+ torch.full(
155
+ (batch_size, used_compression_tokens),
156
+ # self.config.pad_token_id,
157
+ self.image_processor.tokenizer.pad_token_id,
158
+ dtype=input_ids.dtype,
159
+ device=input_ids.device,
160
+ ),
161
+ ],
162
+ dim=1,
163
+ )
164
+ attention_mask = torch.cat(
165
+ [
166
+ attention_mask,
167
+ torch.zeros(
168
+ (batch_size, used_compression_tokens),
169
+ dtype=attention_mask.dtype,
170
+ device=attention_mask.device,
171
+ ),
172
+ ],
173
+ dim=1,
174
+ )
175
+ inputs_embeds = self.vlm.get_input_embeddings()(input_ids)
176
+ else:
177
+ raise NotImplementedError(
178
+ "Currently only input_ids are supported as VLM compressor inputs"
179
+ )
180
+
181
+ image_mask = None
182
+ video_mask = None
183
+
184
+ if pixel_values is not None:
185
+ image_embeds, deepstack_image_embeds = self.vlm.get_image_features(
186
+ pixel_values, image_grid_thw
187
+ )
188
+ image_embeds = torch.cat(image_embeds, dim=0).to(
189
+ inputs_embeds.device, inputs_embeds.dtype
190
+ )
191
+ image_mask, _ = self.vlm.model.get_placeholder_mask(
192
+ input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
193
+ )
194
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
195
+
196
+ if pixel_values_videos is not None:
197
+ video_embeds, deepstack_video_embeds = self.vlm.get_video_features(
198
+ pixel_values_videos, video_grid_thw
199
+ )
200
+ video_embeds = torch.cat(video_embeds, dim=0).to(
201
+ inputs_embeds.device, inputs_embeds.dtype
202
+ )
203
+ _, video_mask = self.vlm.model.get_placeholder_mask(
204
+ input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
205
+ )
206
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
207
+
208
+ visual_pos_masks = None
209
+ deepstack_visual_embeds = None
210
+ if image_mask is not None and video_mask is not None:
211
+ # aggregate visual_pos_masks and deepstack_visual_embeds
212
+ image_mask = image_mask[..., 0]
213
+ video_mask = video_mask[..., 0]
214
+ visual_pos_masks = image_mask | video_mask
215
+ deepstack_visual_embeds = []
216
+ image_mask_joint = image_mask[visual_pos_masks]
217
+ video_mask_joint = video_mask[visual_pos_masks]
218
+ for img_embed, vid_embed in zip(
219
+ deepstack_image_embeds, deepstack_video_embeds
220
+ ):
221
+ embed_joint = img_embed.new_zeros(
222
+ visual_pos_masks.sum(), img_embed.shape[-1]
223
+ ).to(img_embed.device)
224
+ embed_joint[image_mask_joint, :] = img_embed
225
+ embed_joint[video_mask_joint, :] = vid_embed
226
+ deepstack_visual_embeds.append(embed_joint)
227
+ elif image_mask is not None:
228
+ image_mask = image_mask[..., 0]
229
+ visual_pos_masks = image_mask
230
+ deepstack_visual_embeds = deepstack_image_embeds
231
+ elif video_mask is not None:
232
+ video_mask = video_mask[..., 0]
233
+ visual_pos_masks = video_mask
234
+ deepstack_visual_embeds = deepstack_video_embeds
235
+
236
+ # ------------------------------------------------------------------
237
+ # inputs_embeds now holds [Text + Image + Video], shape: (B, L, D)
238
+ # concat Learnable Tokens
239
+ position_compression_token = (
240
+ torch.arange(n_seq + used_compression_tokens, device=input_ids.device)
241
+ .unsqueeze(0)
242
+ .expand(batch_size, -1)
243
+ )
244
+ mask_compression_token = (
245
+ position_compression_token >= valid_lengths.unsqueeze(1)
246
+ ) & (
247
+ position_compression_token
248
+ < (valid_lengths + used_compression_tokens).unsqueeze(1)
249
+ )
250
+ compression_tokens_expanded = self.compression_tokens[
251
+ :, :used_compression_tokens, :
252
+ ].expand(batch_size, -1, -1)
253
+ inputs_embeds[mask_compression_token] = compression_tokens_expanded.reshape(
254
+ -1, self.hidden_size
255
+ ).to(inputs_embeds.dtype)
256
+ attention_mask.masked_fill_(mask_compression_token, 1)
257
+ # ------------------------------------------------------------------
258
+
259
+ if position_ids is None:
260
+ attention_mask_tensor = (
261
+ attention_mask
262
+ if not isinstance(attention_mask, dict)
263
+ else attention_mask["full_attention"]
264
+ )
265
+ if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
266
+ attention_mask_tensor = torch.diagonal(
267
+ attention_mask_tensor[:, 0], dim1=1, dim2=2
268
+ )
269
+ # Only apply conversion for floating point tensors (inverted masks)
270
+ if attention_mask_tensor.dtype.is_floating_point:
271
+ attention_mask_tensor = (
272
+ attention_mask_tensor
273
+ / torch.finfo(attention_mask_tensor.dtype).min
274
+ )
275
+ attention_mask_tensor = (1.0 - attention_mask_tensor).int()
276
+
277
+ # Calculate RoPE index once per generation in the pre-fill stage only.
278
+ # When compiling, we can't check tensor values thus we check only input length
279
+ # It is safe to assume that `length!=1` means we're in pre-fill because compiled
280
+ # models currently cannot do assisted decoding
281
+ prefill_compiled_stage = is_torchdynamo_compiling() and (
282
+ (input_ids is not None and input_ids.shape[1] != 1)
283
+ or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
284
+ )
285
+ prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
286
+ (cache_position is not None and cache_position[0] == 0)
287
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
288
+ )
289
+ if (
290
+ prefill_compiled_stage or prefill_noncompiled_stage
291
+ ) or getattr(self, "rope_deltas", None) is None: # avoid AttributeError before the first prefill
292
+ position_ids, rope_deltas = self.vlm.model.get_rope_index(
293
+ input_ids,
294
+ image_grid_thw,
295
+ video_grid_thw,
296
+ attention_mask=attention_mask_tensor,
297
+ )
298
+ self.rope_deltas = rope_deltas
299
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
300
+ else:
301
+ batch_size, seq_length, _ = inputs_embeds.shape
302
+ delta = (
303
+ (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
304
+ if cache_position is not None
305
+ else 0
306
+ )
307
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
308
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
309
+ if cache_position is not None: # otherwise `deltas` is an int `0`
310
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
311
+ position_ids = position_ids.add(delta)
312
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
313
+
314
+ outputs = self.vlm.model.language_model(
315
+ input_ids=None,
316
+ position_ids=position_ids,
317
+ attention_mask=attention_mask,
318
+ past_key_values=past_key_values,
319
+ inputs_embeds=inputs_embeds,
320
+ cache_position=cache_position,
321
+ visual_pos_masks=visual_pos_masks,
322
+ deepstack_visual_embeds=deepstack_visual_embeds,
323
+ **kwargs,
324
+ )
325
+
326
+ last_hidden_state = outputs.last_hidden_state
327
+ compression_features_flat = last_hidden_state[mask_compression_token]
328
+ compression_features = compression_features_flat.reshape(
329
+ batch_size, used_compression_tokens, -1
330
+ )
331
+
332
+ relevance_scores = None
333
+ if self.dynamic_compress:
334
+ relevance_scores = self.compute_relevance(mask_compression_token, batch_size, last_hidden_state)
335
+
336
+ return compression_features, relevance_scores
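A self-contained sketch of the relevance router used for dynamic compression (hidden size, vocabulary size and token ids below are placeholders): a segment's score is the sigmoid of the hidden state just before the compression tokens, projected onto the difference between the "Yes" and "No" rows of the LM head.

import torch
import torch.nn as nn

hidden, vocab = 8, 32
lm_head = nn.Linear(hidden, vocab, bias=False)
yes_id, no_id = 3, 7                          # placeholder token ids

router = nn.Linear(hidden, 1, bias=False)
with torch.no_grad():
    router.weight[0] = lm_head.weight[yes_id] - lm_head.weight[no_id]

prev_token_features = torch.randn(2, hidden)  # one feature vector per segment
relevance = torch.sigmoid(router(prev_token_features)).squeeze(-1)
print(relevance)                              # values in (0, 1), one per segment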
tempo/multimodal_encoder/siglip_encoder.py ADDED
@@ -0,0 +1,75 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from transformers import SiglipImageProcessor, SiglipVisionModel
4
+
5
+ from .base_encoder import BaseVisionTower
6
+
7
+
8
+ class SiglipVisionTower(BaseVisionTower):
9
+ def __init__(self, vision_tower_name, args, delay_load=False):
10
+ super(SiglipVisionTower, self).__init__(vision_tower_name, args, delay_load)
11
+ model_path, res, interp = vision_tower_name, 384, 576
12
+ self.vision_tower_name = model_path
13
+ self._image_size = res if res is not None else 512
14
+ self._interp_size = interp
15
+ if not self.delay_load:
16
+ self.load_model()
17
+ elif self.unfreeze_mm_vision_tower:
18
+ self.load_model()
19
+ else:
20
+ self._hidden_size = 1152
21
+
22
+ def load_model(self, device_map=None):
23
+ self.vision_model = "siglip"
24
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
25
+
26
+ # self.vision_tower = clip_model.visual.trunk
27
+ self.vision_tower.output_tokens = True
28
+
29
+ self._hidden_size = self.vision_tower.config.hidden_size
30
+ self._image_size = self.vision_tower.config.image_size
31
+ self._patch_size = self.vision_tower.config.patch_size
32
+ self.image_processor = SiglipImageProcessor.from_pretrained(
33
+ self.vision_tower_name
34
+ )
35
+
36
+ self.vision_tower.requires_grad_(self.unfreeze_mm_vision_tower)
37
+ self.is_loaded = True
38
+
39
+ def interpolate(self, image_features):
40
+ if self._interp_size is None:
41
+ return image_features
42
+
43
+ b, num_tokens, dim = image_features.shape
44
+
45
+ if num_tokens != self.num_patches:
46
+ target_h = target_w = int(self._interp_size**0.5)
47
+ h = w = int(num_tokens**0.5)
48
+
49
+ image_features = image_features.view(b, h, w, dim)
50
+ image_features = image_features.permute(0, 3, 1, 2).contiguous()
51
+
52
+ image_features = F.interpolate(
53
+ image_features.to(torch.float32),
54
+ size=(target_h, target_w),
55
+ mode="bilinear",
56
+ align_corners=False,
57
+ ).to(image_features.dtype)
58
+
59
+ # Permute the dimensions back to (b, target_h, target_w, dim)
60
+ image_features = image_features.permute(0, 2, 3, 1).contiguous()
61
+
62
+ # Flatten the spatial dimensions (target_h, target_w) into a single dimension
63
+ image_features = image_features.flatten(1, 2)
64
+
65
+ return image_features
66
+
67
+ def _forward(self, images, interpolate_token=576):
68
+ with torch.set_grad_enabled(self.unfreeze_mm_vision_tower):
69
+ embeddings = self.vision_tower.vision_model.embeddings(images)
70
+ encoder_outputs = self.vision_tower.vision_model.encoder(
71
+ inputs_embeds=embeddings
72
+ )
73
+ image_features = encoder_outputs.last_hidden_state
74
+ interp_features = self.interpolate(image_features)
75
+ return interp_features
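A shape-level sketch of the interpolate() step (sizes are illustrative): the (B, N, D) patch tokens are reshaped to a square grid, bilinearly resized to the target grid, and flattened back to (B, target, D).

import torch
import torch.nn.functional as F

b, n, d, target = 1, 27 * 27, 16, 576          # 27x27 tokens -> 24x24 tokens
x = torch.randn(b, n, d)
h = w = int(n ** 0.5)
t = int(target ** 0.5)
grid = x.view(b, h, w, d).permute(0, 3, 1, 2)  # (B, D, H, W)
resized = F.interpolate(grid, size=(t, t), mode="bilinear", align_corners=False)
out = resized.permute(0, 2, 3, 1).flatten(1, 2)
print(out.shape)                               # torch.Size([1, 576, 16])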
tempo/multimodal_projector/__pycache__/builder.cpython-312.pyc ADDED
Binary file (3.18 kB). View file
 
tempo/multimodal_projector/builder.py ADDED
@@ -0,0 +1,51 @@
1
+ import re
2
+ import torch.nn as nn
3
+
4
+ class IdentityMap(nn.Module):
5
+ def __init__(self):
6
+ super().__init__()
7
+
8
+ def forward(self, x, *args, **kwargs):
9
+ return x
10
+
11
+ @property
12
+ def config(self):
13
+ return {"mm_projector_type": "identity"}
14
+
15
+
16
+ class SimpleResBlock(nn.Module):
17
+ def __init__(self, channels):
18
+ super().__init__()
19
+ self.pre_norm = nn.LayerNorm(channels)
20
+
21
+ self.proj = nn.Sequential(
22
+ nn.Linear(channels, channels),
23
+ nn.GELU(),
24
+ nn.Linear(channels, channels)
25
+ )
26
+
27
+ def forward(self, x):
28
+ x = self.pre_norm(x)
29
+ return x + self.proj(x)
30
+
31
+
32
+ def build_vision_projector(config):
33
+
34
+ projector_type = getattr(config, "mm_projector_type", "linear")
35
+
36
+ if projector_type == "linear":
37
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
38
+
39
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
40
+ if mlp_gelu_match:
41
+ mlp_depth = int(mlp_gelu_match.group(1))
42
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
43
+ for _ in range(1, mlp_depth):
44
+ modules.append(nn.GELU())
45
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
46
+ return nn.Sequential(*modules)
47
+
48
+ if projector_type == "identity":
49
+ return IdentityMap()
50
+
51
+ raise ValueError(f"Unknown projector type: {projector_type}")
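A usage sketch with hypothetical sizes: the "mlp2x_gelu" projector type expands to Linear -> GELU -> Linear, mapping the concatenated vision hidden size onto the language-model hidden size.

from types import SimpleNamespace
from tempo.multimodal_projector.builder import build_vision_projector

cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=2048, hidden_size=4096)
projector = build_vision_projector(cfg)
print(projector)  # Sequential(Linear(2048->4096), GELU(), Linear(4096->4096))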
tempo/tempo_arch.py ADDED
@@ -0,0 +1,464 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from abc import ABC, abstractmethod
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from tempo.constants import (
21
+ DEFAULT_IM_END_TOKEN,
22
+ DEFAULT_IM_START_TOKEN,
23
+ DEFAULT_IMAGE_PATCH_TOKEN,
24
+ IGNORE_INDEX,
25
+ IMAGE_TOKEN_INDEX,
26
+ )
27
+
28
+ from tempo.multimodal_encoder.builder import build_vision_tower_aux_list
29
+ from tempo.multimodal_projector.builder import build_vision_projector
30
+ from tempo.vlm_multimodal_processor import VLMMultimodalProcessor
31
+
32
+
33
+ class TempoMetaModel:
34
+ def __init__(self, config):
35
+ super(TempoMetaModel, self).__init__(config)
36
+
37
+ if hasattr(config, "mm_vision_tower_aux_list"):
38
+ self.vision_tower_aux_list = nn.ModuleList(
39
+ build_vision_tower_aux_list(config, delay_load=True)
40
+ )
41
+ config.mm_hidden_size = sum(
42
+ [
43
+ vision_tower_aux.hidden_size for vision_tower_aux in self.vision_tower_aux_list
44
+ ]
45
+ )
46
+ self.mm_projector = build_vision_projector(config)
47
+ else:
48
+ raise NotImplementedError(
49
+ "mm_vision_tower_aux_list is not found in config. Please initialize vision modules in the subclass of TempoMetaModel."
50
+ )
51
+
52
+ def get_vision_tower_aux_list(self):
53
+ vision_tower_aux_list = getattr(self, "vision_tower_aux_list", None)
54
+ return vision_tower_aux_list
55
+
56
+ def initialize_vision_modules(self, model_args, fsdp=None):
57
+ # vision_hidden_size = model_args.vision_hidden_size
58
+ vision_tower_aux_list = model_args.vision_tower_aux_list
59
+ # vision_tower_aux_token_len_list = model_args.vision_tower_aux_token_len_list
60
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
61
+ self.config.mm_vision_tower_aux_list = vision_tower_aux_list
62
+ # self.config.mm_vision_tower_aux_token_len_list = vision_tower_aux_token_len_list
63
+
64
+ if self.get_vision_tower_aux_list() is None:
65
+ vision_tower_aux_list = build_vision_tower_aux_list(model_args)
66
+ if model_args.unfreeze_mm_vision_tower:
67
+ self.vision_tower_aux_list = nn.ModuleList(vision_tower_aux_list)
68
+ else:
69
+ self.vision_tower_aux_list = vision_tower_aux_list
70
+ else:
71
+ vision_tower_aux_list = self.vision_tower_aux_list
72
+ for vision_tower_aux in vision_tower_aux_list:
73
+ vision_tower_aux.load_model()
74
+
75
+ if model_args.unfreeze_mm_vision_tower and not isinstance(self.vision_tower_aux_list, nn.ModuleList):
76
+ self.vision_tower_aux_list = nn.ModuleList(self.vision_tower_aux_list)
77
+
78
+ self.config.mm_projector_type = getattr(model_args, "mm_projector_type", "linear")
79
+ # self.config.vision_hidden_size = vision_hidden_size
80
+
81
+ if getattr(self, "mm_projector", None) is None:
82
+ self.config.mm_hidden_size = sum(
83
+ [
84
+ vision_tower_aux.hidden_size for vision_tower_aux in vision_tower_aux_list
85
+ ]
86
+ )
87
+ self.mm_projector = build_vision_projector(self.config)
88
+ else:
89
+ for p in self.mm_projector.parameters():
90
+ p.requires_grad = True
91
+
92
+ if pretrain_mm_mlp_adapter is not None:
93
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location="cpu")
94
+
95
+ def get_w(weights, keyword):
96
+ return {
97
+ k.split(keyword + ".")[1]: v
98
+ for k, v in weights.items()
99
+ if keyword + "." in k
100
+ }
101
+
102
+ self.mm_projector.load_state_dict(
103
+ get_w(mm_projector_weights, "mm_projector"), strict=True
104
+ )
105
+
106
+
107
+ class TempoMetaForCausalLM(ABC):
108
+ @abstractmethod
109
+ def get_model(self):
110
+ pass
111
+
112
+ def get_vision_tower_aux_list(self):
113
+ return self.get_model().get_vision_tower_aux_list()
114
+
115
+ def prepare_inputs_labels_for_multimodal(
116
+ self,
117
+ input_ids,
118
+ position_ids,
119
+ attention_mask,
120
+ past_key_values,
121
+ labels,
122
+ images=None,
123
+ image_sizes=None,
124
+ vlm_inputs=None,
125
+ seg_timestamps=None,
126
+ batch_split_size=None,
127
+ relevance=None,
128
+ ):
129
+ if input_ids.shape[1] == 1: # inference
130
+ return (
131
+ input_ids,
132
+ position_ids,
133
+ attention_mask,
134
+ past_key_values,
135
+ None,
136
+ labels,
137
+ )
138
+
139
+ is_video = "pixel_values_videos" in vlm_inputs
140
+
141
+ compressed_features, relevance_scores = VLMMultimodalProcessor.tokenize_vision_inputs(self.get_vision_tower_aux_list()[0], vlm_inputs, is_video)
142
+ compressed_features, count_allocations = (
143
+ VLMMultimodalProcessor.adaptive_linear_budget_allocation(
144
+ compressed_features,
145
+ relevance_scores,
146
+ is_video,
147
+ max_budget=self.config.visual_token_budget if hasattr(self.config, "visual_token_budget") else 8192,
148
+ min_tokens=4,
149
+ strategy="head",
150
+ )
151
+ )
152
+
153
+ # kept for visualization of the allocation results; can be removed
154
+ self._demo_count_allocations = count_allocations
155
+
156
+ if isinstance(compressed_features, list):
157
+ seg_lens = [feat.shape[0] for feat in compressed_features]
158
+ compressed_features = torch.cat(compressed_features, dim=0)
159
+ image_features = self.get_model().mm_projector(compressed_features)
160
+ print(f"[Total Segments: {len(seg_lens)}], compressed features after dynamic compression:", image_features.shape)
161
+ image_features = list(torch.split(image_features, seg_lens, dim=0))
162
+ else:
163
+ image_features = self.get_model().mm_projector(compressed_features)
164
+ print("final compression features for the whole batch:", image_features.shape)
165
+
166
+ if is_video:
167
+ # add timestamp embeddings for video inputs
168
+ if batch_split_size is not None and len(batch_split_size) > 1: # batch training
169
+ print(f"Number of segments for each video is: {batch_split_size}")
170
+ start_idx = 0
171
+ final_image_features_list = []
172
+ for b_split_size in batch_split_size:
173
+ current_image_features = image_features[start_idx : start_idx + b_split_size]
174
+ current_seg_timestamp = seg_timestamps[start_idx : start_idx + b_split_size]
175
+ final_image_features_list.append(
176
+ VLMMultimodalProcessor.add_seg_timestamp(
177
+ current_image_features,
178
+ self.get_model(),
179
+ current_seg_timestamp,
180
+ is_video,
181
+ )
182
+ )
183
+ start_idx += b_split_size
184
+ else:
185
+ final_image_features_list = [
186
+ VLMMultimodalProcessor.add_seg_timestamp(
187
+ image_features, self.get_model(), seg_timestamps, is_video
188
+ )
189
+ ]
190
+ else:
191
+ final_image_features_list = [img for img in image_features]
192
+
193
+ _labels = labels
194
+ _position_ids = position_ids
195
+ _attention_mask = attention_mask
196
+
197
+ if attention_mask is None:
198
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
199
+ else:
200
+ attention_mask = attention_mask.bool()
201
+
202
+ if position_ids is None:
203
+ position_ids = torch.arange(
204
+ 0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
205
+ )
206
+
207
+ if labels is None:
208
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
209
+
210
+ attention_mask = attention_mask | (input_ids == IMAGE_TOKEN_INDEX)
211
+
212
+ input_ids = [
213
+ cur_input_ids[cur_attention_mask]
214
+ for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
215
+ ]
216
+ labels = [
217
+ cur_labels[cur_attention_mask]
218
+ for cur_labels, cur_attention_mask in zip(labels, attention_mask)
219
+ ]
220
+
221
+ new_input_embeds = []
222
+ new_labels = []
223
+ cur_image_idx = 0
224
+
225
+ for batch_idx, cur_input_ids in enumerate(input_ids):
226
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
227
+
228
+ if num_images == 0:
229
+ cur_image_features = final_image_features_list[cur_image_idx]
230
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
231
+ cur_input_embeds = torch.cat(
232
+ [cur_input_embeds_1, cur_image_features[0:0]], dim=0
233
+ )
234
+ new_input_embeds.append(cur_input_embeds)
235
+ new_labels.append(labels[batch_idx])
236
+ cur_image_idx += 1
237
+ continue
238
+
239
+ image_token_indices = (
240
+ [-1]
241
+ + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
242
+ + [cur_input_ids.shape[0]]
243
+ )
244
+
245
+ cur_input_ids_noim = []
246
+ cur_labels = labels[batch_idx]
247
+ cur_labels_noim = []
248
+
249
+ for i in range(len(image_token_indices) - 1):
250
+ cur_input_ids_noim.append(
251
+ cur_input_ids[
252
+ image_token_indices[i] + 1 : image_token_indices[i + 1]
253
+ ]
254
+ )
255
+ cur_labels_noim.append(
256
+ cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]
257
+ )
258
+
259
+ split_sizes_text = [x.shape[0] for x in cur_labels_noim]
260
+ cur_input_embeds = self.get_model().embed_tokens(
261
+ torch.cat(cur_input_ids_noim)
262
+ )
263
+ cur_input_embeds_no_im = torch.split(
264
+ cur_input_embeds, split_sizes_text, dim=0
265
+ )
266
+
267
+ # known issue: multi-image inputs are not handled correctly here
268
+ cur_new_input_embeds = []
269
+ cur_new_labels = []
270
+ text_len = sum([x.shape[0] for x in cur_input_embeds_no_im])
271
+ visual_len = len(final_image_features_list[cur_image_idx])
272
+ max_visual_len = (
273
+ self.get_model().config.tokenizer_model_max_length
274
+ - getattr(self.get_model().config, "inference_max_length", 16)
275
+ - text_len
276
+ )
277
+
278
+ if max_visual_len < visual_len:
279
+ final_image_features_list[cur_image_idx] = final_image_features_list[cur_image_idx][:max_visual_len]
280
+
281
+ for i in range(num_images + 1):
282
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
283
+ cur_new_labels.append(cur_labels_noim[i])
284
+
285
+ if i < num_images:
286
+ try:
287
+ cur_image_features = final_image_features_list[cur_image_idx]
288
+ except IndexError:
289
+ print(f"cur_image_idx={cur_image_idx} is out of range for {num_images} image(s); reusing the previous features")
290
+ cur_image_features = final_image_features_list[cur_image_idx - 1]
291
+
292
+ cur_image_idx += 1
293
+ cur_new_input_embeds.append(cur_image_features)
294
+ cur_new_labels.append(
295
+ torch.full(
296
+ (cur_image_features.shape[0],),
297
+ IGNORE_INDEX,
298
+ device=cur_labels.device,
299
+ dtype=cur_labels.dtype,
300
+ )
301
+ )
302
+
303
+ cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
304
+
305
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
306
+ cur_new_labels = torch.cat(cur_new_labels)
307
+ new_input_embeds.append(cur_new_input_embeds)
308
+ new_labels.append(cur_new_labels)
309
+
310
+ tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
311
+ if tokenizer_model_max_length is not None:
312
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
313
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
314
+
315
+ max_len = max(x.shape[0] for x in new_input_embeds)
316
+ batch_size = len(new_input_embeds)
317
+
318
+ new_input_embeds_padded = []
319
+ new_labels_padded = torch.full(
320
+ (batch_size, max_len),
321
+ IGNORE_INDEX,
322
+ dtype=new_labels[0].dtype,
323
+ device=new_labels[0].device,
324
+ )
325
+ attention_mask = torch.zeros(
326
+ (batch_size, max_len),
327
+ dtype=attention_mask.dtype,
328
+ device=attention_mask.device,
329
+ )
330
+ position_ids = torch.zeros(
331
+ (batch_size, max_len),
332
+ dtype=position_ids.dtype,
333
+ device=position_ids.device,
334
+ )
335
+
336
+ for i, (cur_new_embed, cur_new_labels) in enumerate(
337
+ zip(new_input_embeds, new_labels)
338
+ ):
339
+ cur_len = cur_new_embed.shape[0]
340
+
341
+ if getattr(self.config, "tokenizer_padding_side", "right") == "left":
342
+ new_input_embeds_padded.append(
343
+ torch.cat(
344
+ (
345
+ torch.zeros(
346
+ (max_len - cur_len, cur_new_embed.shape[1]),
347
+ dtype=cur_new_embed.dtype,
348
+ device=cur_new_embed.device,
349
+ ),
350
+ cur_new_embed,
351
+ ),
352
+ dim=0,
353
+ )
354
+ )
355
+ if cur_len > 0:
356
+ new_labels_padded[i, -cur_len:] = cur_new_labels
357
+ attention_mask[i, -cur_len:] = True
358
+ position_ids[i, -cur_len:] = torch.arange(
359
+ 0,
360
+ cur_len,
361
+ dtype=position_ids.dtype,
362
+ device=position_ids.device,
363
+ )
364
+ else:
365
+ new_input_embeds_padded.append(
366
+ torch.cat(
367
+ (
368
+ cur_new_embed,
369
+ torch.zeros(
370
+ (max_len - cur_len, cur_new_embed.shape[1]),
371
+ dtype=cur_new_embed.dtype,
372
+ device=cur_new_embed.device,
373
+ ),
374
+ ),
375
+ dim=0,
376
+ )
377
+ )
378
+ if cur_len > 0:
379
+ new_labels_padded[i, :cur_len] = cur_new_labels
380
+ attention_mask[i, :cur_len] = True
381
+ position_ids[i, :cur_len] = torch.arange(
382
+ 0,
383
+ cur_len,
384
+ dtype=position_ids.dtype,
385
+ device=position_ids.device,
386
+ )
387
+
388
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
389
+
390
+ if _labels is None:
391
+ new_labels = None
392
+ else:
393
+ new_labels = new_labels_padded
394
+
395
+ if _attention_mask is None:
396
+ attention_mask = None
397
+ else:
398
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
399
+
400
+ if _position_ids is None:
401
+ position_ids = None
402
+
403
+ return (
404
+ None,
405
+ position_ids,
406
+ attention_mask,
407
+ past_key_values,
408
+ new_input_embeds,
409
+ new_labels,
410
+ )
411
+
412
+ def initialize_vision_tokenizer(self, model_args, tokenizer):
413
+ if model_args.mm_use_im_patch_token:
414
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
415
+ self.resize_token_embeddings(len(tokenizer))
416
+
417
+ if model_args.mm_use_im_start_end:
418
+ num_new_tokens = tokenizer.add_tokens(
419
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
420
+ )
421
+ self.resize_token_embeddings(len(tokenizer))
422
+
423
+ if num_new_tokens > 0:
424
+ input_embeddings = self.get_input_embeddings().weight.data
425
+ output_embeddings = self.get_output_embeddings().weight.data
426
+
427
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
428
+ dim=0, keepdim=True
429
+ )
430
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
431
+ dim=0, keepdim=True
432
+ )
433
+
434
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
435
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
436
+
437
+ if model_args.tune_mm_mlp_adapter:
438
+ for p in self.get_input_embeddings().parameters():
439
+ p.requires_grad = True
440
+ for p in self.get_output_embeddings().parameters():
441
+ p.requires_grad = False
442
+
443
+ if model_args.pretrain_mm_mlp_adapter:
444
+ mm_projector_weights = torch.load(
445
+ model_args.pretrain_mm_mlp_adapter, map_location="cpu"
446
+ )
447
+ embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
448
+ assert num_new_tokens == 2
449
+ if input_embeddings.shape == embed_tokens_weight.shape:
450
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight[
451
+ -num_new_tokens:
452
+ ]
453
+ elif embed_tokens_weight.shape[0] == num_new_tokens:
454
+ input_embeddings[-num_new_tokens:] = embed_tokens_weight
455
+ else:
456
+ raise ValueError(
457
+ f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Number of new tokens: {num_new_tokens}."
458
+ )
459
+ elif model_args.mm_use_im_patch_token:
460
+ if model_args.tune_mm_mlp_adapter:
461
+ for p in self.get_input_embeddings().parameters():
462
+ p.requires_grad = False
463
+ for p in self.get_output_embeddings().parameters():
464
+ p.requires_grad = False
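A toy sketch of the splicing done in prepare_inputs_labels_for_multimodal (constants and shapes below are placeholders; see tempo/constants.py for the real values): projected image features replace the IMAGE_TOKEN_INDEX placeholder, and the corresponding label positions are filled with IGNORE_INDEX so they carry no loss.

import torch

IMAGE_TOKEN_INDEX, IGNORE_INDEX = -200, -100   # placeholder values
d = 4
ids = torch.tensor([5, 6, IMAGE_TOKEN_INDEX, 7])
text_embed = torch.randn(ids.shape[0], d)      # stand-in for embed_tokens(ids)
image_feats = torch.randn(3, d)                # stand-in for mm_projector output

pos = (ids == IMAGE_TOKEN_INDEX).nonzero()[0].item()
embeds = torch.cat([text_embed[:pos], image_feats, text_embed[pos + 1 :]], dim=0)
labels = torch.cat([ids[:pos], torch.full((3,), IGNORE_INDEX), ids[pos + 1 :]])
print(embeds.shape, labels)                    # torch.Size([6, 4]); image span ignored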
tempo/vlm_multimodal_processor.py ADDED
@@ -0,0 +1,332 @@
1
+ import random
2
+ import numpy as np
3
+ import torch
4
+
5
+ class VLMMultimodalProcessor:
6
+ """SVLM-based vision compression."""
7
+
8
+ @staticmethod
9
+ def tokenize_vision_inputs(vision_language_model, vlm_inputs, is_video, chunk_size=4):
10
+ return vision_language_model(**vlm_inputs)
11
+
12
+ @staticmethod
13
+ def add_seg_timestamp(vision_features, model, seg_timestamps, is_video):
14
+ if not is_video:
15
+ return vision_features
16
+
17
+ device = vision_features[0].device
18
+ dtype = vision_features[0].dtype
19
+
20
+ max_len = max(len(ts) for ts in seg_timestamps)
21
+ num_segments = len(seg_timestamps)
22
+ # pad_token_id = getattr(model.config, "pad_token_id", 151643)
23
+ pad_token_id = 151643
24
+ timestamp_ids_tensor = torch.full(
25
+ (num_segments, max_len), pad_token_id, dtype=torch.long, device=device
26
+ )
27
+
28
+ for i, ts in enumerate(seg_timestamps):
29
+ length = len(ts)
30
+ timestamp_ids_tensor[i, :length] = torch.tensor(ts, device=device)
31
+
32
+ timestamp_embeds = model.get_input_embeddings()(timestamp_ids_tensor).to(dtype)
33
+
34
+ final_vision_features = []
35
+ for i in range(num_segments):
36
+ if vision_features[i].shape[0] == 0:
37
+ print("Empty segment features; dropping this segment.")
38
+ continue
39
+ final_vision_features.append(
40
+ torch.cat(
41
+ [
42
+ timestamp_embeds[i][: len(seg_timestamps[i])],
43
+ vision_features[i],
44
+ ],
45
+ dim=0,
46
+ )
47
+ )
48
+
49
+ # return torch.cat(final_vision_features, dim=0).unsqueeze(0)
50
+ return torch.cat(final_vision_features, dim=0) # (comp_frame1+comp_frame2+comp_frame3+..., d)
51
+
52
+ @staticmethod
53
+ def tome_merge(x: torch.Tensor, target_num: int) -> torch.Tensor:
54
+ """
55
+ Token Merging using bipartite soft matching.
56
+ Reference: "Token Merging: Your ViT But Faster" (Bolya et al.)
57
+ Args:
58
+ x: (n, d) tensor of token features
59
+ target_num: number of tokens to keep after merging
60
+ Returns:
61
+ merged tokens: (target_num, d) tensor
62
+ """
63
+ if target_num <= 0:
64
+ raise ValueError("target_num must be positive")
65
+
66
+ n, d = x.shape
67
+
68
+ if target_num >= n:
69
+ return x
70
+
71
+ while x.shape[0] > target_num:
72
+ current_n = x.shape[0]
73
+
74
+ if current_n < 2:
75
+ break
76
+
77
+ t1 = (current_n + 1) // 2 # ceil(n/2) - source token
78
+ t2 = current_n // 2 # floor(n/2) - target token
79
+
80
+ if t2 == 0:
81
+ break
82
+
83
+ tokens_to_remove = current_n - target_num
84
+ r = min(tokens_to_remove, t1)
85
+
86
+ if r <= 0:
87
+ break
88
+
89
+ x_batch = x.unsqueeze(0) # (1, n, d)
90
+ k = x_batch / x_batch.norm(dim=-1, keepdim=True)
91
+ a, b = k[..., ::2, :], k[..., 1::2, :]
92
+ scores = a @ b.transpose(-1, -2)
93
+ node_max, node_idx = scores.max(dim=-1)
94
+ edge_idx = node_max.argsort(dim=-1, descending=True)[..., None]
95
+ unm_idx = edge_idx[..., r:, :]
96
+ src_idx = edge_idx[..., :r, :]
97
+ dst_idx = node_idx[..., None].gather(dim=-2, index=src_idx)
98
+ unm_idx = unm_idx.sort(dim=-2)[0]
99
+ # merge
100
+ src, dst = x_batch[..., ::2, :], x_batch[..., 1::2, :]
101
+ batch, _, c = src.shape
102
+ unm = src.gather(dim=-2, index=unm_idx.expand(batch, t1 - r, c))
103
+ src_to_merge = src.gather(dim=-2, index=src_idx.expand(batch, r, c))
104
+ dst = dst.scatter_add(-2, dst_idx.expand(batch, r, c), src_to_merge)
105
+ # pooling
106
+ ones = torch.ones(batch, r, 1, device=x.device, dtype=x.dtype)
107
+ dst_counts = torch.ones(
108
+ batch, dst.shape[1], 1, device=x.device, dtype=x.dtype
109
+ )
110
+ dst_counts = dst_counts.scatter_add(-2, dst_idx.expand(batch, r, 1), ones)
111
+ dst = dst / dst_counts
112
+ x = torch.cat([unm, dst], dim=-2).squeeze(0) # (new_n, d)
113
+ return x
114
+
115
+ @staticmethod
116
+ def topk_compress(
117
+ vision_features,
118
+ relevance_scores,
119
+ is_video,
120
+ k=0,
121
+ drop_ratio=0.5,
122
+ strategy="topk",
123
+ ):
124
+ """
125
+ Drop or truncate low-scoring segments, keeping only the first k tokens, based on relevance scores.
126
+ Args:
127
+ vision_features: (n_segment, n, d)
128
+ relevance_scores: n_segment (list of scores between 0 and 1)
129
+ k: number of tokens to keep, k=0 means drop directly
130
+ drop_ratio: ratio of segments to drop/truncate
131
+ strategy: "topk" (keep highest), "lastk" (keep lowest), "random" (random drop)
132
+ Return a list of tensor with length of n_segment. Each tensor is (k, d) or (num_compression_token, d)
133
+ """
134
+
135
+ if not is_video or relevance_scores is None:
136
+ return vision_features
137
+
138
+ n_segment = vision_features.shape[0]
139
+ if n_segment <= 1:
140
+ print("video has at most one segment; skipping compression")
141
+ return vision_features
142
+
143
+ if strategy == "topk":
144
+ # in ascending order
145
+ sorted_indices = sorted(range(n_segment), key=lambda i: relevance_scores[i])
146
+ elif strategy == "lastk":
147
+ # in descending order
148
+ sorted_indices = sorted(
149
+ range(n_segment), key=lambda i: relevance_scores[i], reverse=True
150
+ )
151
+ elif strategy == "random":
152
+ # shuffle index, random
153
+ sorted_indices = list(range(n_segment))
154
+ random.shuffle(sorted_indices)
155
+ else:
156
+ raise ValueError(f"Unknown strategy: {strategy}")
157
+
158
+ # truncate or drop ratio
159
+ num_to_prune = int(n_segment * drop_ratio)
160
+ print(
161
+ f"[topk_compress] total segment: {n_segment}, drop/truncate segment: {num_to_prune}, keep first {k} token"
162
+ )
163
+
164
+ low_score_indices = set(sorted_indices[:num_to_prune])
165
+
166
+ result = []
167
+ for i in range(n_segment):
168
+ if i in low_score_indices:
169
+ result.append(vision_features[i, :k, :]) # shape: (k, d)
170
+ else:
171
+ result.append(vision_features[i]) # shape: (n, d)
172
+
173
+ print(
174
+ f"[topk_compress] segments: {n_segment} (pruned={num_to_prune}, kept={n_segment - num_to_prune}), "
175
+ f"kept tokens for pruned segment={k}"
176
+ )
177
+
178
+ return result
179
+
180
+ @staticmethod
181
+ def adaptive_linear_budget_allocation(
182
+ vision_features,
183
+ relevance_scores,
184
+ is_video,
185
+ max_budget=8192,
186
+ min_tokens=4,
187
+ max_tokens=None,
188
+ strategy="head",
189
+ ):
190
+ """
191
+ Soft token allocation based on min-max normalized relevance scores. Uses a linear mapping instead of softmax, which yields more aggressive sparsity.
192
+ Args:
193
+ vision_features: (n_segment, n_tokens, d)
194
+ relevance_scores: list/array of scores
195
+ is_video: bool
196
+ max_budget: the largest budget for a video
197
+ min_tokens: minimum number of tokens to allocate to each segment
198
+ max_tokens: maximum number of tokens to allocate to each segment
199
+ strategy: "head", "tail", "random", "tome"
200
+ Return a list of tensor with length of n_segment. Each tensor is (k, d), where k in [min_tokens, max_tokens]
201
+ """
202
+ if not is_video or relevance_scores is None:
203
+ return vision_features, None
204
+
205
+ n_segments = vision_features.shape[0]
206
+ n_tokens_per_segment = vision_features.shape[1]
207
+ max_tokens = min(max_tokens or n_tokens_per_segment, n_tokens_per_segment)
208
+
209
+ base_budget = n_segments * min_tokens
210
+ if base_budget > max_budget:
211
+ actual_min = max(1, max_budget // n_segments)
212
+ print(
213
+ f"[adaptive_linear_budget_allocation] Warning: budget insufficient, "
214
+ f"min_tokens: {min_tokens} -> {actual_min}"
215
+ )
216
+ min_tokens = actual_min
217
+ base_budget = n_segments * min_tokens
218
+
219
+ # Convert to tensor
220
+ scores = torch.tensor(relevance_scores, dtype=torch.float32)
221
+ score_min = scores.min()
222
+ score_max = scores.max()
223
+ score_range = score_max - score_min
224
+
225
+ if score_range < 1e-8:
226
+ k = min(max_tokens, max_budget // n_segments)
227
+ return [vision_features[i, :k, :] for i in range(n_segments)], torch.full(
228
+ (n_segments,), k, dtype=torch.long
229
+ )
230
+
231
+ # Normalize scores to [0, 1] range
232
+ normalized_scores = (scores - score_min) / score_range
233
+
234
+ # Linear mapping: [0, 1] -> [min_tokens, max_tokens]
235
+ token_range = max_tokens - min_tokens
236
+ ideal_allocations = (
237
+ min_tokens + (normalized_scores * token_range).floor().long()
238
+ )
239
+
240
+ # ========== Budget Protection ==========
241
+ total_desired = ideal_allocations.sum().item()
242
+
243
+ if total_desired > max_budget:
244
+ # Only scale down when total demand exceeds budget
245
+ print(
246
+ f"[adaptive_linear_budget_allocation] Warning: Desired budget "
247
+ f"({total_desired}) exceeds max ({max_budget}). Scaling down..."
248
+ )
249
+
250
+ extra_budget = max_budget - base_budget
251
+
252
+ # Prevent division by zero
253
+ score_sum = normalized_scores.sum().item()
254
+ if score_sum < 1e-8:
255
+ # Fallback to uniform distribution if all normalized scores are ~0
256
+ weights = torch.ones_like(normalized_scores) / n_segments
257
+ else:
258
+ weights = normalized_scores / score_sum
259
+
260
+ # Distribute extra budget proportionally
261
+ extra_allocation = (weights * extra_budget).floor().long()
262
+ remainder = int(extra_budget - extra_allocation.sum().item())
263
+ if remainder > 0:
264
+ top_indices = torch.argsort(weights, descending=True)[:remainder]
265
+ for idx in top_indices:
266
+ extra_allocation[idx] += 1
267
+
268
+ allocations = min_tokens + extra_allocation
269
+ else:
270
+ allocations = ideal_allocations
271
+
272
+ # Final clamp to ensure bounds
273
+ allocations = allocations.clamp(min=min_tokens, max=max_tokens)
274
+
275
+ # default, head truncation
276
+ if strategy == "head":
277
+ result = []
278
+ for i in range(n_segments):
279
+ k = min(allocations[i].item(), n_tokens_per_segment)
280
+ result.append(vision_features[i, :k, :])
281
+
282
+ elif strategy == "tail":
283
+ result = []
284
+ for i in range(n_segments):
285
+ k = min(allocations[i].item(), n_tokens_per_segment)
286
+ if k == 0:
287
+ result.append(vision_features[i, :0, :])
288
+ else:
289
+ result.append(vision_features[i, -k:, :])
290
+
291
+ elif strategy == "random":
292
+ result = []
293
+ for i in range(n_segments):
294
+ k = min(allocations[i].item(), n_tokens_per_segment)
295
+ if k == 0:
296
+ result.append(vision_features[i, :0, :])
297
+ else:
298
+ indices = torch.randperm(
299
+ n_tokens_per_segment, device=vision_features.device
300
+ )[:k]
301
+ indices = indices.sort().values
302
+ result.append(vision_features[i, indices, :])
303
+
304
+ elif strategy == "tome":
305
+ result = []
306
+ for i in range(n_segments):
307
+ k = min(allocations[i].item(), n_tokens_per_segment)
308
+ if k == 0:
309
+ result.append(vision_features[i, :0, :])
310
+ elif k == n_tokens_per_segment:
311
+ result.append(vision_features[i])
312
+ else:
313
+ # Token Merging
314
+ result.append(
315
+ VLMMultimodalProcessor.tome_merge(
316
+ vision_features[i], target_num=k
317
+ )
318
+ )
319
+ else:
320
+ raise ValueError(f"Unknown strategy: {strategy}")
321
+
322
+ total_used = allocations.sum().item()
323
+
324
+ print(
325
+ f"[adaptive_linear_budget_allocation] segments={n_segments}, "
326
+ f"budget_used={total_used}/{max_budget}, "
327
+ f"theoretical_range=[{min_tokens}, {max_tokens}], "
328
+ f"actual_range=[{allocations.min().item():.0f}, {allocations.max().item():.0f}]",
329
+ flush=True,
330
+ )
331
+
332
+ return result, allocations
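A worked example of the allocation arithmetic with toy numbers (not taken from a real run): relevance scores are min-max normalized and mapped linearly onto [min_tokens, max_tokens]; the result is only rescaled when the total would exceed max_budget.

import torch

scores = torch.tensor([0.2, 0.5, 0.9])
min_tokens, max_tokens = 4, 16
norm = (scores - scores.min()) / (scores.max() - scores.min())   # [0.00, 0.43, 1.00]
alloc = min_tokens + (norm * (max_tokens - min_tokens)).floor().long()
print(alloc)   # tensor([ 4,  9, 16]) -> 29 tokens in total for this toy video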