Model Overview

Model Summary

The Moonshine models are designed for English speech recognition, capable of transcribing spoken audio into accurate English text. Developed by Useful Sensors, these models align with the company’s focus on real-time transcription solutions built for low-cost, resource-efficient hardware. Moonshine is available in two variants, each offering a different balance of size and performance, outlined in the presets table below.

Weights are released under the MIT License. Keras model code is released under the Apache 2 License.

Links

Installation

Keras and KerasHub can be installed with:

pip install -U -q keras-hub
pip install -U -q keras

Jax, TensorFlow, and Torch come preinstalled in Kaggle Notebooks. For instructions on installing them in another environment, see the Keras Getting Started page.

Presets

The following model checkpoints are provided by the Keras team. Full code examples for each are available below.

| Preset name | Parameters | Description |
|---|---|---|
| moonshine_base_en | 61.5M | For real-time transcription, Moonshine Base is a portable, powerful English voice recognition model. It is perfect for applications where accuracy and speed are crucial since it provides excellent accuracy with extremely low latency. |
| moonshine_tiny_en | 27.1M | For real-time transcription, Moonshine Tiny is a compact and efficient English voice recognition model. It’s ideal for resource-constrained applications where low latency and reliable accuracy are essential. |

Example Usage

import os

import keras
import keras_hub
import numpy as np
import librosa
import tensorflow as tf

from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
    MoonshineAudioToText,
)

# Custom backbone: constructed directly from explicit hyperparameters rather
# than from a preset.
backbone_config = dict(
    vocabulary_size=10000,
    filter_dim=256,
    encoder_num_layers=6,
    decoder_num_layers=6,
    hidden_dim=256,
    intermediate_dim=512,
    encoder_num_heads=8,
    decoder_num_heads=8,
    feedforward_expansion_factor=4,
    decoder_use_swiglu_activation=True,
    encoder_use_swiglu_activation=False,
)
backbone = keras_hub.models.MoonshineBackbone(**backbone_config)

# Placeholder inputs standing in for real audio features (e.g., from
# MoonshineAudioConverter): all-zero encoder values and decoder token ids,
# paired with all-True padding masks.
encoder_length, decoder_length = 16000, 20
backbone_inputs = {
    "encoder_input_values": np.zeros((1, encoder_length, 1)),
    "encoder_padding_mask": np.ones((1, encoder_length), dtype=bool),
    "decoder_token_ids": np.zeros((1, decoder_length), dtype=np.int32),
    "decoder_padding_mask": np.ones((1, decoder_length), dtype=bool),
}
outputs = backbone(backbone_inputs)

# Config for test.
BATCH_SIZE = 2
AUDIO_PATH = "path/to/audio_file.wav"

# Load the clip as mono audio at 16 kHz, then add a trailing channel axis and
# a leading batch axis so a single example can be fed to the model.
audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32)
audio_tensor = tf.expand_dims(audio_tensor, axis=-1)
single_audio_input_batched = tf.expand_dims(audio_tensor, axis=0)
# Repeat the same clip along the batch axis to get BATCH_SIZE examples.
audio_batch = tf.repeat(single_audio_input_batched, BATCH_SIZE, axis=0)

# One target text per audio example. Cycle through the sample sentences so
# the number of texts always equals BATCH_SIZE; with a fixed two-item list,
# Dataset.zip would silently stop at the shorter dataset whenever BATCH_SIZE
# is changed.
sample_texts = ["Sample transcription.", "Another sample transcription."]
dummy_texts = [sample_texts[i % len(sample_texts)] for i in range(BATCH_SIZE)]

# Create tf.data.Dataset yielding {"audio": ..., "text": ...} batches.
audio_ds = tf.data.Dataset.from_tensor_slices(audio_batch)
text_ds = tf.data.Dataset.from_tensor_slices(dummy_texts)
audio_dataset = (
    tf.data.Dataset.zip((audio_ds, text_ds))
    # `clip` (not `audio`) avoids shadowing the waveform loaded above.
    .map(lambda clip, txt: {"audio": clip, "text": txt})
    .batch(BATCH_SIZE)
)
print("Audio dataset created.")

# Load pretrained Moonshine model. Use the public `keras_hub.models` entry
# point rather than the private `keras_hub.src` module path, which is an
# internal implementation detail.
audio_to_text = keras_hub.models.MoonshineAudioToText.from_preset(
    "moonshine_tiny_en"
)

# Generation examples: first a single clip, then the whole batch.
generated_text_single = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (single audio): {generated_text_single}")

generated_text_batch = audio_to_text.generate({"audio": audio_batch})
print(f"Generated text (batch audio): {generated_text_batch}")

# Compile the generate() function with a custom sampler.
audio_to_text.compile(sampler="top_k")
generated_text_top_k = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (top_k sampler): {generated_text_top_k}")

# Re-compile to switch to the greedy sampler and generate again.
audio_to_text.compile(sampler="greedy")
generated_text_greedy = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (greedy sampler): {generated_text_greedy}")

# Fine-tuning example: compile with an optimizer, loss and metric, then run a
# single training step over the audio/text dataset.
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = keras.metrics.SparseCategoricalAccuracy()
audio_to_text.compile(
    optimizer=optimizer,
    loss=loss_fn,
    weighted_metrics=[accuracy],
)
history = audio_to_text.fit(audio_dataset, epochs=1, steps_per_epoch=1)
print(f"Fine-tuning completed. Training history: {history.history}")

# Detached preprocessing: run preprocessing, generation and postprocessing as
# separate steps with the preprocessor removed from the model.
original_preprocessor = audio_to_text.preprocessor
audio_to_text.preprocessor = None
try:
    preprocessed_batch = original_preprocessor.generate_preprocess(
        {"audio": audio_batch}
    )
    print(f"Preprocessed batch keys: {preprocessed_batch.keys()}")
    # With no preprocessor attached, the stop token ids are passed explicitly,
    # taken from the detached preprocessor's tokenizer.
    stop_ids = (original_preprocessor.tokenizer.end_token_id,)
    generated_batch_tokens = audio_to_text.generate(
        preprocessed_batch, stop_token_ids=stop_ids
    )
    print(f"Generated tokens keys: {generated_batch_tokens.keys()}")
    final_strings = original_preprocessor.generate_postprocess(
        generated_batch_tokens
    )
    print(f"Final generated strings (detached): {final_strings}")
finally:
    # Always restore the preprocessor, even if a step above raised, so the
    # model is never left without one.
    audio_to_text.preprocessor = original_preprocessor
    print("Preprocessor reattached.")

Example Usage with Hugging Face URI

import os

import keras
import keras_hub
import numpy as np
import librosa
import tensorflow as tf

from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
    MoonshineAudioToText,
)

# Custom backbone: constructed directly from explicit hyperparameters rather
# than from a preset.
backbone_config = dict(
    vocabulary_size=10000,
    filter_dim=256,
    encoder_num_layers=6,
    decoder_num_layers=6,
    hidden_dim=256,
    intermediate_dim=512,
    encoder_num_heads=8,
    decoder_num_heads=8,
    feedforward_expansion_factor=4,
    decoder_use_swiglu_activation=True,
    encoder_use_swiglu_activation=False,
)
backbone = keras_hub.models.MoonshineBackbone(**backbone_config)

# Placeholder inputs standing in for real audio features (e.g., from
# MoonshineAudioConverter): all-zero encoder values and decoder token ids,
# paired with all-True padding masks.
encoder_length, decoder_length = 16000, 20
backbone_inputs = {
    "encoder_input_values": np.zeros((1, encoder_length, 1)),
    "encoder_padding_mask": np.ones((1, encoder_length), dtype=bool),
    "decoder_token_ids": np.zeros((1, decoder_length), dtype=np.int32),
    "decoder_padding_mask": np.ones((1, decoder_length), dtype=bool),
}
outputs = backbone(backbone_inputs)

# Config for test.
BATCH_SIZE = 2
AUDIO_PATH = "path/to/audio_file.wav"

# Load the clip as mono audio at 16 kHz, then add a trailing channel axis and
# a leading batch axis so a single example can be fed to the model.
audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
audio_tensor = tf.convert_to_tensor(audio, dtype=tf.float32)
audio_tensor = tf.expand_dims(audio_tensor, axis=-1)
single_audio_input_batched = tf.expand_dims(audio_tensor, axis=0)
# Repeat the same clip along the batch axis to get BATCH_SIZE examples.
audio_batch = tf.repeat(single_audio_input_batched, BATCH_SIZE, axis=0)

# One target text per audio example. Cycle through the sample sentences so
# the number of texts always equals BATCH_SIZE; with a fixed two-item list,
# Dataset.zip would silently stop at the shorter dataset whenever BATCH_SIZE
# is changed.
sample_texts = ["Sample transcription.", "Another sample transcription."]
dummy_texts = [sample_texts[i % len(sample_texts)] for i in range(BATCH_SIZE)]

# Create tf.data.Dataset yielding {"audio": ..., "text": ...} batches.
audio_ds = tf.data.Dataset.from_tensor_slices(audio_batch)
text_ds = tf.data.Dataset.from_tensor_slices(dummy_texts)
audio_dataset = (
    tf.data.Dataset.zip((audio_ds, text_ds))
    # `clip` (not `audio`) avoids shadowing the waveform loaded above.
    .map(lambda clip, txt: {"audio": clip, "text": txt})
    .batch(BATCH_SIZE)
)
print("Audio dataset created.")

# Load pretrained Moonshine model from the Hugging Face Hub. Use the public
# `keras_hub.models` entry point rather than the private `keras_hub.src`
# module path, which is an internal implementation detail.
audio_to_text = keras_hub.models.MoonshineAudioToText.from_preset(
    "hf://keras/moonshine_tiny_en"
)

# Generation examples: first a single clip, then the whole batch.
generated_text_single = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (single audio): {generated_text_single}")

generated_text_batch = audio_to_text.generate({"audio": audio_batch})
print(f"Generated text (batch audio): {generated_text_batch}")

# Compile the generate() function with a custom sampler.
audio_to_text.compile(sampler="top_k")
generated_text_top_k = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (top_k sampler): {generated_text_top_k}")

# Re-compile to switch to the greedy sampler and generate again.
audio_to_text.compile(sampler="greedy")
generated_text_greedy = audio_to_text.generate(
    {"audio": single_audio_input_batched}
)
print(f"Generated text (greedy sampler): {generated_text_greedy}")

# Fine-tuning example: compile with an optimizer, loss and metric, then run a
# single training step over the audio/text dataset.
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = keras.metrics.SparseCategoricalAccuracy()
audio_to_text.compile(
    optimizer=optimizer,
    loss=loss_fn,
    weighted_metrics=[accuracy],
)
history = audio_to_text.fit(audio_dataset, epochs=1, steps_per_epoch=1)
print(f"Fine-tuning completed. Training history: {history.history}")

# Detached preprocessing: run preprocessing, generation and postprocessing as
# separate steps with the preprocessor removed from the model.
original_preprocessor = audio_to_text.preprocessor
audio_to_text.preprocessor = None
try:
    preprocessed_batch = original_preprocessor.generate_preprocess(
        {"audio": audio_batch}
    )
    print(f"Preprocessed batch keys: {preprocessed_batch.keys()}")
    # With no preprocessor attached, the stop token ids are passed explicitly,
    # taken from the detached preprocessor's tokenizer.
    stop_ids = (original_preprocessor.tokenizer.end_token_id,)
    generated_batch_tokens = audio_to_text.generate(
        preprocessed_batch, stop_token_ids=stop_ids
    )
    print(f"Generated tokens keys: {generated_batch_tokens.keys()}")
    final_strings = original_preprocessor.generate_postprocess(
        generated_batch_tokens
    )
    print(f"Final generated strings (detached): {final_strings}")
finally:
    # Always restore the preprocessor, even if a step above raised, so the
    # model is never left without one.
    audio_to_text.preprocessor = original_preprocessor
    print("Preprocessor reattached.")
Downloads last month
13
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Collection including keras/moonshine_tiny_en

Paper for keras/moonshine_tiny_en