Compare commits

...

10 Commits

Author SHA1 Message Date
hueso
aec56abe73 WIP: parakeeet 2026-05-18 23:10:13 -03:00
mesonium
2e4aeb3b6f Update README 2026-05-05 05:23:23 -03:00
mesonium
e1dd4dd9fe Update README 2026-05-05 05:23:23 -03:00
mesonium
2dae45d2aa Update README 2026-05-05 05:23:23 -03:00
mesonium
24644f7fd4 Hide settings if model is not available 2026-05-05 05:23:23 -03:00
mesonium
2d7630a757 Make Faster Whisper and OpenAI Whisper work 2026-05-05 05:23:23 -03:00
mesonium
b0adecef7a Improve multi modul support and refactor 2026-05-05 05:23:23 -03:00
mesonium
aff69e5b40 Add TODOs 2026-05-05 05:23:23 -03:00
mesonium
18c4fe9361 Follow up from previous two commits 2026-05-05 05:23:23 -03:00
mesonium
1889be0323 Move model_settings into model folder 2026-05-05 05:23:23 -03:00
14 changed files with 677 additions and 539 deletions

View File

@@ -1,28 +1,49 @@
# Requirements # About
## STT Models This plugin allows you, in conjunction with a _general-purpose speech recognition model_, to transcribe your voice messages to text.
### openai-whisper https://github.com/openai/whisper In order to make use of this plugin, you need to have at least one of the following models installed:
#### Installation #### OpenAI Whisper
`pip install -U openai-whisper` will install - Website: https://github.com/openai/whisper
- Installable by: `pip install -U openai-whisper`
``` #### Faster Whisper
mpmath, urllib3, tqdm, sympy, regex, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, - Website: https://github.com/SYSTRAN/faster-whisper
nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, - Installable by: `pip install -U faster-whisper`
nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, networkx,
MarkupSafe, llvmlite, fsspec, filelock, charset-normalizer, certifi, triton,
requests, nvidia-cusparse-cu12, nvidia-cudnn-cu12, numba, jinja2, tiktoken,
nvidia-cusolver-cu12, torch, openai-whisper
```
#### Models Additionally you have to checkout the following Gajim branch:
https://dev.gajim.org/mesonium/gajim/-/tree/stt_voice_messages
| Multi Langual Model | Download Size | VRAM Requirement | Relative Speed | # Hint
|---------------------|---------------| ---------------- |----------------|
| Tiny | 70 MB | ~1 GB | ~32x |
| Base | 140 MB | ~1 GB | ~16x |
| Small | 460 MB | ~2 GB | ~6x |
| Medium | 1.4 GB | ~5 GB | ~2x |
| Large | 2.9 GB | ~10 GB | ~1x |
_**The plugin is very much POC at this stage!**_
Currently, the chosen model is downloaded in the background on first use, during which
Gajim's UI may not respond.
Typical model sizes for OpenAI Whisper are:
| Multilingual Model  | Download Size |
|---------------------|---------------|
| Tiny | 70 MB |
| Base | 140 MB |
| Small | 460 MB |
| Medium | 1.4 GB |
| Large | 2.9 GB |
# TODO
- [x] Offer multiple models
- [ ] Add various model settings
- [ ] Model receiving
- [ ] Hint model download state
- [ ] Allow to change model download location
- [ ] Allow to use local models
- [ ] Database Handling
- [ ] Store transcribed messages in a DB
- [ ] Option to delete DB
- [ ] Update UI
- [ ] Make it more pretty
- [ ] Show progress bar
- [ ] Highlight words on playback

View File

@@ -18,22 +18,23 @@ from __future__ import annotations
import logging import logging
import typing import typing
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING from typing import TYPE_CHECKING, Any
import whisper try:
from gi.repository import Gtk import onnx_asr
except ModuleNotFoundError:
if typing.TYPE_CHECKING:
import onnx_asr
from gi.repository import Adw, Gtk
from gajim.common import app
from gajim.common.app import Any
from gajim.gtk.builder import get_builder
from gajim.gtk.const import Setting, SettingKind, SettingType from gajim.gtk.const import Setting, SettingKind, SettingType
from gajim.gtk.settings import SettingsBox from gajim.gtk.filechoosers import Filter
from gajim.gtk.sidebar_switcher import SideBarSwitcher from gajim.gtk.settings import GajimPreferencesGroup, SettingsDialog
from gajim.plugins.helpers import get_builder
from gajim.plugins.plugins_i18n import _ from gajim.plugins.plugins_i18n import _
from ..model_settings import * from ..models import stt
from ..models import openai_whisper from ..models.model_settings import OnnxAsrSettings
if TYPE_CHECKING: if TYPE_CHECKING:
from ..stt_voice_messages import STTVoiceMessagesPlugin from ..stt_voice_messages import STTVoiceMessagesPlugin
@@ -41,210 +42,250 @@ if TYPE_CHECKING:
log = logging.getLogger('gajim.p.sttvm_config_dialog') log = logging.getLogger('gajim.p.sttvm_config_dialog')
SUPPORTED_MODELS: dict[str, dict[str, typing.Union[list[str], Any, str]]] = {
'model_openaiwhisper': {
'moduls': ['whisper'],
'class': openai_whisper.WhisperModel,
'name': 'OpenAI Whisper'
},
'model_ctranslate2': {
'moduls': ['ctranslate2'],
'class': None,
'name': _('CTranslate2')
},
'model_faster-whisper': {
'moduls': ['faster-whisper'],
'class': None,
'name:': _('Faster-Whisper')
},
'model_distill': {
'moduls': ['transformers', 'accelerate', 'datasets[audio]'],
'class': None,
'name': _('Distill')
}
}
class Configuration: class Configuration:
def __init__(self, plugin: STTVoiceMessagesPlugin): def __init__(self, plugin: STTVoiceMessagesPlugin):
self._plugin = plugin self._plugin = plugin
self._openaiwhisper_settings = OpenAIWhisperSettings() self._instance = None
self._available_models: dict[ self._main_model_row = None
str, dict[str, typing.Union[list[str], Any, str]]] = {} self._preset_model_picker = None
self.check_available_moduls() self._custom_model_id_entry = None
self._local_model_file_picker = None
self._status_group = None
self._model_data: dict[str, str] = {}
self._instance = stt.OnnxAsrModel()
self._instance.set_config(OnnxAsrSettings(
model_id=self.plugin.config['model_id'],
model_path=self.plugin.config['model_path']
))
self._model_data = self._steal_model_list()
@property @property
def plugin(self) -> STTVoiceMessagesPlugin: def plugin(self) -> STTVoiceMessagesPlugin:
return self._plugin return self._plugin
@property @property
def available_models(self) -> dict[str, dict[str, typing.Union[list[str], Any, str]]]: def is_available(self) -> bool:
return self._available_models return self._instance is not None
def unload_model(self) -> None:
    """Call ``unload_now()`` on the model instance when one exists."""
    instance = self._instance
    if instance is None:
        # No model instance was created, so there is nothing to unload.
        return
    instance.unload_now()
def _steal_model_list(self) -> dict[str, str]:
    """Collect the model names accepted by ``onnx_asr.load_model``.

    UGLY: the choices are scraped from the type annotation of the
    ``model`` parameter. Every string found one level inside the
    annotation's arguments becomes an entry mapping the name to itself.
    """
    annotation = onnx_asr.load_model.__annotations__.get('model')
    choices = {}
    for group in typing.get_args(annotation):
        for candidate in typing.get_args(group):
            # Non-string arguments (e.g. plain types) are not model names.
            if isinstance(candidate, str):
                choices[candidate] = candidate
    return choices
def on_setting(self, value: Any, data: Any) -> None: def on_setting(self, value: Any, data: Any) -> None:
if isinstance(value, str): if isinstance(value, str):
value.strip() value = value.strip()
log.debug('plugin config before:\n %s', self.plugin.config.data)
self.plugin.config[data] = value self.plugin.config[data] = value
self._plugin.config['model_instance'].on_setting(data, value)
log.debug('plugin config after:\n %s', self.plugin.config.data)
def on_set_model(self, value: Any, data: Any) -> None: def on_preset_changed(self, value: str, data: Any) -> None:
if isinstance(value, str): if self._custom_model_id_entry is not None:
value.strip() entry_text = self._custom_model_id_entry.entry.get_text().strip()
log.debug('plugin config before:\n %s', self.plugin.config.data) if entry_text:
self._update_model_status()
return # custom entry overrides; ignore preset change
self._write_model_id(value)
self._update_model_status()
self._available_models[value]['model_instance'] = self._available_models[value]['class']() def on_custom_model_id_changed(self, value: str, data: Any) -> None:
value = value.strip()
if value:
self._write_model_id(value)
elif self._preset_model_picker is not None:
preset_key = self._preset_model_picker._dropdown.get_selected_key()
if preset_key is not None:
self._write_model_id(preset_key)
self._apply_sensitivity_state()
self._update_model_status()
self.plugin.config['model_class'] = self._available_models[value][ def on_model_file_picked(self, value: str, data: Any) -> None:
'class'] self._write_model_path(str(Path(value).parent) if value else '')
self.plugin.config['model_instance'] = self._available_models[value]['model_instance'] self._apply_sensitivity_state()
self._update_model_status()
self.on_setting(value, data) def _write_model_id(self, model_id: str) -> None:
log.debug('plugin config after:\n %s', self.plugin.config.data) if self.plugin.config['model_id'] == model_id:
return
self.plugin.config['model_id'] = model_id
if self._instance is not None:
self._instance.set_config(OnnxAsrSettings(
model_id=self.plugin.config['model_id'],
model_path=self.plugin.config['model_path']
))
@staticmethod def _write_model_path(self, model_path: str) -> None:
def is_module_available(module: str) -> bool: if self.plugin.config['model_path'] == model_path:
try: return
__import__(module) self.plugin.config['model_path'] = model_path
return True if self._instance is not None:
except ModuleNotFoundError: self._instance.set_config(OnnxAsrSettings(
log.debug('Could not find module %s', module) model_id=self.plugin.config['model_id'],
return False model_path=self.plugin.config['model_path']
except ImportError as ex: ))
log.debug(str(ex))
return False
def check_available_moduls(self): def sync_model_path_from_widget(self) -> None:
for model in SUPPORTED_MODELS: if self._local_model_file_picker is None:
available = True return
for modul in SUPPORTED_MODELS[model]['moduls']: button = self._local_model_file_picker.get_activatable_widget()
if not self.is_module_available(modul): path = button.get_path()
available = False new_path = str(path.parent) if path else ''
continue self._write_model_path(new_path)
if available:
self._available_models[model] = SUPPORTED_MODELS[model]
if (self.plugin.config.data['model_class'] is None def _apply_sensitivity_state(self) -> None:
and len(self._available_models) > 0): if self._preset_model_picker is None:
model = list(self._available_models)[0] return
self.on_set_model(model, 'model') has_local = bool(self.plugin.config['model_path'])
log.debug('Choose first available model!') entry_text = (self._custom_model_id_entry.entry.get_text().strip()
if self._custom_model_id_entry else '')
has_entry = bool(entry_text)
self._custom_model_id_entry.set_sensitive(not has_local)
self._preset_model_picker.set_sensitive(not has_local and not has_entry)
def _update_model_status(self) -> None:
if self._main_model_row is None:
return
entry_text = (self._custom_model_id_entry.entry.get_text().strip()
if self._custom_model_id_entry else '')
if self.plugin.config['model_path']:
path = Path(self.plugin.config['model_path'])
summary = _('Local: {}').format(path.name or str(path))
description = _('Loading model files from {}').format(path)
if not (path / 'config.json').exists():
description += '\n' + _(
'config.json not found in this directory — onnx-asr will'
' fall back to Model preset or Custom Model ID for the'
' architecture.')
elif entry_text:
summary = _('Custom: {}').format(entry_text)
description = _('Using custom model: {}').format(entry_text)
else: else:
log.debug('Available model already chosen!') preset_key = (self._preset_model_picker._dropdown.get_selected_key()
if self._preset_model_picker else '')
summary = preset_key or _('(none)')
description = (_('Using preset: {}').format(preset_key)
if preset_key else '')
log.debug('models = %s', self._available_models) self._main_model_row._label.set_text(summary)
if self._status_group is not None:
self._status_group.set_description(description)
class STTVoiceMessagesConfigDialog(SettingsDialog):
class PreferenceBox(SettingsBox):
def __init__(self, settings: list[Setting]) -> None:
SettingsBox.__init__(self, None)
self.get_style_context().add_class('border')
self.set_selection_mode(Gtk.SelectionMode.NONE)
self.set_vexpand(False)
self.set_valign(Gtk.Align.END)
for setting in settings:
self.add_setting(setting)
self.update_states()
class STTVoiceMessagesConfigDialog(Gtk.ApplicationWindow):
def __init__(self, config: Configuration, parent: Gtk.Window) -> None: def __init__(self, config: Configuration, parent: Gtk.Window) -> None:
Gtk.ApplicationWindow.__init__(self)
self.set_application(app.app)
self.set_position(Gtk.WindowPosition.CENTER)
self.set_show_menubar(False)
self.set_name('PreferencesWindow')
self.set_default_size(900, 650)
self.set_resizable(True)
self.set_title(_('STT Voice Messages - Preferences'))
ui_path = Path(__file__).parent
self._ui = get_builder(str(ui_path.resolve() / 'config_dialog.ui'))
self._prefs: dict[str, PreferenceBox] = {}
side_bar_switcher = SideBarSwitcher()
side_bar_switcher.set_stack(self._ui.stack)
self._ui.grid.attach(side_bar_switcher, 0, 0, 1, 1)
self.add(self._ui.grid)
self.config = config self.config = config
self.plugin = self.config.plugin self.plugin = self.config.plugin
if not config.is_available:
return
prefs: list[tuple[str, type[PreferenceBox]]] = [ rows = [
('stt_behaviour', self.STTBehaviour), Setting(SettingKind.SWITCH,
('models', self.Models), _('Auto Transcribe'),
('whisper_general', self.OpenAIWhisperGeneral), SettingType.VALUE,
value=self.plugin.config['auto_transcribe'],
data='auto_transcribe',
callback=config.on_setting,
desc=_('Transcribe messages as they appear')),
Setting(SettingKind.SUBPAGE,
_('Model'),
SettingType.VALUE,
value=None,
name='main_model',
props={'subpage': 'sttvm-model'}),
] ]
self._add_prefs(prefs)
self.show_all() SettingsDialog.__init__(
self,
parent,
_('STT Voice Messages'),
Gtk.DialogFlags.MODAL,
rows,
'',
)
class STTBehaviour(PreferenceBox): config._main_model_row = self.get_setting('main_model')
def __init__(self, config_dialog: STTVoiceMessagesConfigDialog) -> None:
settings = [ use_custom = self.plugin.config['model_id'] not in config._model_data
Setting(SettingKind.SWITCH,
_('Auto Transcribe'),
SettingType.VALUE,
value=config_dialog.plugin.config['auto_transcribe'],
data='auto_transcribe',
callback=config_dialog.config.on_setting)
]
PreferenceBox.__init__(self, settings)
class Models(PreferenceBox):
def __init__(self, config_dialog: STTVoiceMessagesConfigDialog) -> None:
models: list[tuple[str, str]] = []
for key, value in config_dialog.config.available_models.items():
models.append(
(key, str(value['name']))
)
settings = [
Setting(SettingKind.COMBO,
_('Speech To Text Model'),
SettingType.VALUE,
value=config_dialog.plugin.config['model'],
data='model',
callback=config_dialog.config.on_set_model,
props={'combo_items': models},
desc=_('Choose Model to use')),
]
PreferenceBox.__init__(self, settings)
class OpenAIWhisperGeneral(PreferenceBox):
def __init__(self, config_dialog: STTVoiceMessagesConfigDialog) -> None:
settings = [
Setting(SettingKind.POPOVER,
_('Language Model Size'),
SettingType.VALUE,
value=config_dialog.plugin.config['whisperai_model_size'],
data='whisperai_model_size',
callback=config_dialog.config.on_setting,
props={'entries': whisper.available_models()}),
Setting(SettingKind.SWITCH,
_('Translate'),
SettingType.VALUE,
value=config_dialog.plugin.config['whisperai_translate'],
data='whisperai_translate',
callback=config_dialog.config.on_setting)
]
PreferenceBox.__init__(self, settings)
def _add_prefs(self, prefs: list[tuple[str, type[PreferenceBox]]]):
for ui_name, klass in prefs:
pref_box = getattr(self._ui, ui_name)
pref = klass(self) # pyright: ignore
pref_box.add(pref)
self._prefs[ui_name] = pref
subpage_rows: list[Setting] = [
Setting(SettingKind.DROPDOWN,
_('Model'),
SettingType.VALUE,
value=self.plugin.config['model_id'],
name='preset_model',
callback=config.on_preset_changed,
props={'data': config._model_data}),
Setting(SettingKind.ENTRY,
_('Custom Model'),
SettingType.VALUE,
value=self.plugin.config['model_id'] if use_custom else '',
name='custom_model',
callback=config.on_custom_model_id_changed,
desc=_('Custom HF model path or model ID')),
Setting(SettingKind.FILECHOOSER,
_('Local File'),
SettingType.VALUE,
value='',
name='local_model_file',
callback=config.on_model_file_picked,
desc=_('Model ID is taken from config.json if not set'),
props={'filefilters': [
Filter(_('ONNX model'), suffixes=['onnx'], default=True),
]}),
]
controls_group = GajimPreferencesGroup('model_controls')
for s in subpage_rows:
controls_group.add_setting(s)
status_group = Adw.PreferencesGroup()
pref_page = Adw.PreferencesPage()
pref_page.add(controls_group)
pref_page.add(status_group)
toolbar = Adw.ToolbarView(content=pref_page)
toolbar.add_top_bar(Adw.HeaderBar())
page = Adw.NavigationPage(
tag='sttvm-model', title=_('Model'), child=toolbar)
self._nav.add(page)
config._preset_model_picker = controls_group.get_setting('preset_model')
config._custom_model_id_entry = controls_group.get_setting('custom_model')
config._local_model_file_picker = controls_group.get_setting(
'local_model_file')
config._status_group = status_group
config._custom_model_id_entry.entry.set_placeholder_text(
_('onnx-community/whisper-large-v3-turbo'))
button = config._local_model_file_picker.get_activatable_widget()
button._label_text = _('.oonx')
button.reset()
if self.plugin.config['model_path']:
onnx_in_dir = next(iter(Path(self.plugin.config['model_path']).glob('*.onnx')),
None)
if onnx_in_dir is not None:
button.set_path(onnx_in_dir)
config._update_model_status()
config._apply_sensitivity_state()
def _cleanup(self) -> None:
    """Sync the model path, drop widget references, then run base cleanup."""
    config = self.config
    # Must run first: it reads the file picker that is cleared just below.
    config.sync_model_path_from_widget()
    widget_refs = (
        '_main_model_row',
        '_preset_model_picker',
        '_custom_model_id_entry',
        '_local_model_file_picker',
        '_status_group',
    )
    for attr_name in widget_refs:
        setattr(config, attr_name, None)
    SettingsDialog._cleanup(self)

View File

@@ -1,271 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Generated with glade 3.40.0 -->
<interface>
<requires lib="gtk+" version="3.20"/>
<!-- n-columns=3 n-rows=3 -->
<object class="GtkGrid" id="grid">
<property name="visible">True</property>
<property name="can-focus">False</property>
<child>
<object class="GtkStack" id="stack">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="hexpand">True</property>
<child>
<object class="GtkScrolledWindow">
<property name="visible">True</property>
<property name="can-focus">True</property>
<property name="hscrollbar-policy">never</property>
<property name="shadow-type">in</property>
<property name="overlay-scrolling">False</property>
<child>
<object class="GtkViewport">
<property name="visible">True</property>
<property name="can-focus">False</property>
<child>
<object class="GtkBox">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="spacing">24</property>
<child>
<!-- n-columns=1 n-rows=1 -->
<object class="GtkGrid" id="stt_behaviour">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="row-spacing">12</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">Behaviour of STT Voice Messages</property>
<property name="xalign">0</property>
<style>
<class name="bold"/>
</style>
</object>
<packing>
<property name="left-attach">0</property>
<property name="top-attach">0</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">0</property>
</packing>
</child>
<child>
<!-- n-columns=1 n-rows=1 -->
<object class="GtkGrid" id="models">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="row-spacing">12</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">General Model Configuration</property>
<property name="xalign">0</property>
<style>
<class name="bold"/>
</style>
</object>
<packing>
<property name="left-attach">0</property>
<property name="top-attach">0</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">1</property>
</packing>
</child>
<child>
<!-- n-columns=1 n-rows=1 -->
<object class="GtkGrid" id="file_preview">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="row-spacing">12</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">Preview UI</property>
<property name="xalign">0</property>
<style>
<class name="bold"/>
</style>
</object>
<packing>
<property name="left-attach">0</property>
<property name="top-attach">0</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">2</property>
</packing>
</child>
</object>
</child>
</object>
</child>
</object>
<packing>
<property name="name">general</property>
<property name="title" translatable="yes">General</property>
<property name="icon-name">computer-symbolic</property>
</packing>
</child>
<child>
<object class="GtkScrolledWindow">
<property name="visible">True</property>
<property name="can-focus">True</property>
<property name="hscrollbar-policy">never</property>
<property name="shadow-type">in</property>
<property name="overlay-scrolling">False</property>
<child>
<object class="GtkViewport">
<property name="visible">True</property>
<property name="can-focus">False</property>
<child>
<object class="GtkBox">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="spacing">24</property>
<child>
<!-- n-columns=3 n-rows=3 -->
<object class="GtkGrid" id="whisper_general">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="row-spacing">12</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">General</property>
<property name="xalign">0</property>
<style>
<class name="bold"/>
</style>
</object>
<packing>
<property name="left-attach">0</property>
<property name="top-attach">0</property>
</packing>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">0</property>
</packing>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
</object>
</child>
</object>
</child>
</object>
<packing>
<property name="name">openai-whisper</property>
<property name="title" translatable="yes">openAI Whisper</property>
<property name="position">1</property>
</packing>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<style>
<class name="settings-stack"/>
</style>
</object>
<packing>
<property name="left-attach">1</property>
<property name="top-attach">0</property>
</packing>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
</object>
</interface>

View File

@@ -13,9 +13,12 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>. # along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import logging from __future__ import annotations
from gi.repository import Gtk import logging
from pathlib import Path
from gi.repository import Gtk, Adw
from gajim.plugins.gajimplugin import GajimPluginConfig from gajim.plugins.gajimplugin import GajimPluginConfig
from gajim.plugins.plugins_i18n import _ from gajim.plugins.plugins_i18n import _
@@ -26,48 +29,62 @@ log = logging.getLogger('gajim.p.stt_voice_messages_sttbox')
class STTBox(Gtk.Box): class STTBox(Gtk.Box):
def __init__(self, def __init__(self,
preview_audio_widget: Gtk.Box,
config: GajimPluginConfig, config: GajimPluginConfig,
audio_file: str, audio_file: Path,
) -> None: ) -> None:
Gtk.Box.__init__(self, orientation=Gtk.Orientation.VERTICAL, spacing=12) Gtk.Box.__init__(self, orientation=Gtk.Orientation.HORIZONTAL, spacing=6)
self._config = config self._config = config
self._preview_audio = preview_audio_widget
self._model = None self._model = None
self._audio_file = audio_file self._audio_file = audio_file
self._text = '' self._text = ''
self._transcribe_button = Gtk.Button(label=_('Transcribe')) self._transcribe_button = Gtk.Button.new_from_icon_name("lucide-captions-symbolic")
self._transcribe_button.set_tooltip_text(_('Transcribe voice message'))
self._spinner = Adw.Spinner(valign=Gtk.Align.START, visible=False)
self._transcription_label = Gtk.Label( self._transcription_label = Gtk.Label(
label=_('Nothing transcribed yet')) label=_('Nothing transcribed yet'))
self._transcription_label.set_max_width_chars(40) self._transcription_label.set_max_width_chars(40)
self._transcription_label.set_line_wrap(True) self._transcription_label.set_wrap(True)
self.add(self._transcribe_button)
self.add(self._transcription_label)
self.append(self._spinner)
self.append(self._transcription_label)
self._transcribe_button.connect('clicked', self._on_transcribe_clicked)
self._result = helper.Results('') self._result = helper.Results('')
self._transcribe_button.connect('clicked', self._on_transcribe_clicked) @property
def button(self) -> Gtk.Button:
self.show_all() return self._transcribe_button
def _on_transcribe_clicked(self, _button: Gtk.Button) -> None: def _on_transcribe_clicked(self, _button: Gtk.Button) -> None:
log.debug('config.data = %s', self._config.data) log.debug('config._instance = %s', self._config._instance)
model = self._config.data['model_instance'] self._model = self._config._instance
if model is None: if self._model is None:
return return
self._model = model if self._model.is_loaded:
text = _('Transcribing…')
elif self._model.will_download:
text = _('Downloading ') + self._model.model_id
else:
text = _('Loading model…')
self._transcription_label.set_text(text)
self._spinner.set_visible(True)
self._task = helper.BackgroundTask(
self._model.load, self._on_load_done)
self._task.start()
transcription_task = helper.BackgroundTask( def _on_load_done(self):
self._model.transcribe(self._result, self._audio_file), self._transcription_label.set_text(_('Transcribing…'))
self._show_result self._task = helper.BackgroundTask(
lambda: self._model.recognize(
self._result, helper.load_audio(self._audio_file)),
self._show_result,
) )
transcription_task.start() self._task.start()
def _show_result(self): def _show_result(self):
assert self._model is not None assert self._model is not None
@@ -76,3 +93,4 @@ class STTBox(Gtk.Box):
self._transcription_label.set_text(self._text.strip()) self._transcription_label.set_text(self._text.strip())
else: else:
self._transcription_label.set_text(_('_Have not heard any word!_')) self._transcription_label.set_text(_('_Have not heard any word!_'))
self._spinner.set_visible(False)

View File

@@ -13,16 +13,53 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>. # along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import logging
import typing
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path
import gi
import numpy as np
from gi.repository import Gio, GObject from gi.repository import Gio, GObject
try:
gi.require_version('Gst', '1.0')
from gi.repository import Gst
except Exception:
if typing.TYPE_CHECKING:
from gi.repository import Gst
log = logging.getLogger('gajim.p.sttvm_helper')
@dataclass @dataclass
class Results: class Results:
text: str text: str
def load_audio(path: Path, sample_rate: int = 16000) -> np.ndarray:
    """Decode an audio file into a flat float32 sample array via GStreamer.

    The pipeline decodes the file, converts and resamples it to mono
    F32LE at ``sample_rate`` and hands the buffers to an appsink, from
    which they are pulled until no further sample arrives.

    Args:
        path: Location of the audio file to decode.
        sample_rate: Sample rate requested from the pipeline caps.

    Returns:
        All pulled buffers concatenated into one float32 array.

    Raises:
        RuntimeError: If a buffer cannot be mapped for reading or if no
            samples were pulled at all.
    """
    Gst.init(None)
    pipeline = Gst.parse_launch(
        'filesrc name=src ! decodebin ! audioconvert ! audioresample ! '
        f'audio/x-raw,format=F32LE,rate={sample_rate},channels=1 ! '
        'appsink name=sink sync=false'
    )
    pipeline.get_by_name('src').set_property('location', str(path))
    sink = pipeline.get_by_name('sink')

    chunks: list[np.ndarray] = []
    pipeline.set_state(Gst.State.PLAYING)
    try:
        # try-pull-sample returns None on timeout or end of stream.
        while (sample := sink.emit('try-pull-sample', 10 * Gst.SECOND)) is not None:
            buf = sample.get_buffer()
            mapped, info = buf.map(Gst.MapFlags.READ)
            if not mapped:
                raise RuntimeError(f'Could not map audio buffer: {path}')
            try:
                # Copy the bytes: the mapped memory is released by unmap().
                chunks.append(np.frombuffer(bytes(info.data), dtype=np.float32))
            finally:
                buf.unmap(info)
    finally:
        # Always stop the pipeline, even when decoding raised.
        pipeline.set_state(Gst.State.NULL)

    if not chunks:
        raise RuntimeError(f'Could not decode audio: {path}')
    return np.concatenate(chunks)
''' '''
https://discourse.gnome.org/t/gtk-threading-problem-with-glib-idle-add/13597/5 https://discourse.gnome.org/t/gtk-threading-problem-with-glib-idle-add/13597/5
@@ -57,6 +94,7 @@ class BackgroundTask(GObject.Object):
retval = self.function() retval = self.function()
task.return_value(retval) task.return_value(retval)
except Exception as e: except Exception as e:
log.exception('Background task failed')
task.return_value(e) task.return_value(e)
def finish(self): def finish(self):

View File

@@ -18,7 +18,7 @@ from pathlib import Path
from gajim.gtk.const import Setting from gajim.gtk.const import Setting
from .model import Model from .model_template import Model
try: try:
import ctranslate2 import ctranslate2

View File

@@ -0,0 +1,60 @@
# This file is part of Gajim.
#
# Gajim is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Gajim is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import logging
import typing
from dataclasses import dataclass
from pathlib import Path
from ..helper import Results
from .model_settings import FasterWhisperSettings
from .model_template import Model
log = logging.getLogger('gajim.p.sttvm_faster_whisper')
try:
import faster_whisper
except ModuleNotFoundError:
if typing.TYPE_CHECKING:
import faster_whisper
@dataclass
class Configuration:
    """Configuration record holding a single ``model_size`` string."""
    # NOTE(review): FasterWhisperModel in this module stores a
    # FasterWhisperSettings object rather than this class; whether this
    # dataclass is still used anywhere is not visible here - confirm.
    model_size: str
class FasterWhisperModel(Model):
    """Speech-to-text backend built on the ``faster_whisper`` package."""

    def __init__(self):
        self._result: str = ''
        self._config = FasterWhisperSettings()

    @property
    def result(self) -> str:
        """Return the text held in ``_result`` (initially an empty string)."""
        return self._result

    def transcribe(self, result: Results, audio_file: Path) -> None:
        """Transcribe ``audio_file`` and store the text in ``result.text``.

        A new ``faster_whisper.WhisperModel`` is created on every call,
        using the model size from the current configuration.

        Args:
            result: Object whose ``text`` attribute receives the
                concatenated segment texts.
            audio_file: Audio file to transcribe.
        """
        model = faster_whisper.WhisperModel(
            self._config.model_size, compute_type="float32")
        log.debug('model size is used = %s', self._config.model_size)

        # faster-whisper documents str paths, file-like objects and arrays
        # as input, so hand a Path over as a plain string.
        audio = str(audio_file) if isinstance(audio_file, Path) else audio_file
        segments, _info = model.transcribe(audio)
        # The segments are produced lazily; materialise them once.
        segments = list(segments)
        log.debug('segments = %s', segments)

        for segment in segments:
            # Log through the module logger instead of printing to stdout.
            log.debug('[%.2fs -> %.2fs] %s',
                      segment.start, segment.end, segment.text)
        result.text = ''.join(segment.text for segment in segments)

    def set_config(self, config: FasterWhisperSettings) -> None:
        """Replace the settings used by subsequent ``transcribe`` calls."""
        self._config = config

View File

@@ -18,6 +18,6 @@ from dataclasses import dataclass, field
@dataclass @dataclass
class OpenAIWhisperSettings: class OnnxAsrSettings:
whisperai_model_size: str = field(default='tiny', init=True) model_id: str = field(default='nemo-parakeet-tdt-0.6b-v3', init=True)
model_path: str = ''

View File

@@ -14,19 +14,28 @@
# along with Gajim. If not, see <http://www.gnu.org/licenses/>. # along with Gajim. If not, see <http://www.gnu.org/licenses/>.
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path from typing import Any
from gajim.gtk.const import Setting import numpy as np
from ..helper import Results from ..helper import Results
class Model(ABC): class Model(ABC):
@property
@abstractmethod @abstractmethod
def transcribe(self, result: Results, audio_file: Path) -> str: def is_loaded(self) -> bool:
return '' pass
@abstractmethod @abstractmethod
def on_setting(self, setting: Setting): def load(self) -> None:
pass pass
@abstractmethod
def recognize(self, result: Results, audio: np.ndarray) -> None:
pass
@abstractmethod
def set_config(self, config: Any) -> None:
pass

View File

@@ -19,8 +19,8 @@ from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from ..helper import Results from ..helper import Results
from ..model_settings import OpenAIWhisperSettings from .model_settings import OpenAIWhisperSettings
from .model import Model from .model_template import Model
log = logging.getLogger('gajim.p.sttvm_whisper') log = logging.getLogger('gajim.p.sttvm_whisper')
@@ -37,7 +37,6 @@ class Configuration:
class WhisperModel(Model): class WhisperModel(Model):
def __init__(self): def __init__(self):
# TODO
self._result: str = '' self._result: str = ''
self._config = OpenAIWhisperSettings() self._config = OpenAIWhisperSettings()
@@ -45,12 +44,11 @@ class WhisperModel(Model):
def result(self) -> str: def result(self) -> str:
return self._result return self._result
def transcribe(self, result: Results, audio_file: Path) -> str: def transcribe(self, result: Results, audio_file: Path) -> None:
model = whisper.load_model(self._config['whisperai_model_size']) model = whisper.load_model(self._config.model_size)
log.debug('model size is used = %s', self._config['whisperai_model_size']) log.debug('model size is used = %s', self._config.model_size)
result.text = model.transcribe(audio_file)['text'] result.text = model.transcribe(audio_file)['text'] # pyright: ignore [reportAttributeAccessIssue]
def on_setting(self, key, value): def set_config(self, config: OpenAIWhisperSettings) -> None:
log.debug('key = %s, value = %s', key, value) self._config = config
self._config[key] = value

View File

@@ -0,0 +1,132 @@
# This file is part of Gajim.
#
# Gajim is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Gajim is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import logging
import pickle
import subprocess
import sys
from pathlib import Path
import numpy as np
from gi.repository import GLib
from ..helper import Results
from .model_settings import OnnxAsrSettings
from .model_template import Model
log = logging.getLogger('gajim.p.sttvm_onnx_asr')
_IDLE_UNLOAD_SECONDS = 300
class OnnxAsrModel(Model):
    """Speech-to-text backend that drives ``stt_worker.py`` in a subprocess.

    Commands and responses are exchanged as pickled dicts over the worker's
    stdin/stdout pipes. The worker is shut down again after
    ``_IDLE_UNLOAD_SECONDS`` without use, or explicitly via ``unload_now``.
    """

    def __init__(self):
        self._proc = None           # subprocess.Popen of the worker, if any
        self._loaded = False        # True once the worker confirmed a 'load'
        self._config = OnnxAsrSettings()
        self._unload_source = None  # GLib source id of the idle-unload timer

    @property
    def is_loaded(self) -> bool:
        """Whether a model is loaded in a worker that is still running."""
        # A worker that died takes its model with it, so the flag alone
        # is not sufficient.
        return self._loaded and self._proc_alive()

    @property
    def will_download(self) -> bool:
        """Whether loading the configured model presumably needs a download.

        Returns ``False`` when a model is already loaded, when a local
        ``model_path`` is configured, or when the resolved repository name
        contains no ``/``. Otherwise returns ``True`` unless
        ``try_to_load_from_cache`` yields a path for ``config.json``.
        """
        if self.is_loaded or self._config.model_path:
            return False

        from huggingface_hub import try_to_load_from_cache
        from onnx_asr.resolver import model_repos

        repo = model_repos.get(self._config.model_id, self._config.model_id)
        if '/' not in repo:
            return False
        cached = try_to_load_from_cache(repo, 'config.json')
        return not isinstance(cached, str)

    def load(self) -> None:
        """Load the configured model in the worker and restart the idle timer.

        :raises RuntimeError: if the worker fails or reports an error
        """
        if self.is_loaded:
            self._reschedule_unload()
            return

        log.debug('Loading model %s in worker', self._config.model_id)
        self._send({
            'op': 'load',
            'model_id': self._config.model_id,
            'model_path': self._config.model_path,
        })
        self._loaded = True
        self._reschedule_unload()

    def recognize(self, result: Results, audio: np.ndarray) -> None:
        """Run recognition on *audio* and store the text in ``result.text``.

        :param result: object whose ``text`` attribute receives the text
        :param audio: array forwarded unchanged to the worker
            (NOTE(review): expected sample format is not visible here)
        :raises RuntimeError: if the worker fails or reports an error
        """
        self.load()
        response = self._send({'op': 'recognize', 'audio': audio})
        result.text = response['text']
        self._reschedule_unload()

    def set_config(self, config: OnnxAsrSettings) -> None:
        """Adopt *config*; unload the worker first if the model changed."""
        if (config.model_id != self._config.model_id
                or config.model_path != self._config.model_path):
            self.unload_now()
        # Keep a private copy so later changes to *config* do not leak in
        self._config = OnnxAsrSettings(
            model_id=config.model_id, model_path=config.model_path)

    def unload_now(self) -> None:
        """Cancel the idle timer and shut the worker subprocess down."""
        if self._unload_source is not None:
            GLib.source_remove(self._unload_source)
            self._unload_source = None
        if self._proc is not None:
            log.debug('Terminating STT worker subprocess')
            self._discard_proc()
        self._loaded = False

    def _proc_alive(self) -> bool:
        """Return True if a worker exists and has not exited."""
        return self._proc is not None and self._proc.poll() is None

    def _discard_proc(self) -> None:
        """Stop the current worker (if any) and release its pipes."""
        proc, self._proc = self._proc, None
        self._loaded = False
        if proc is None:
            return
        try:
            # EOF on stdin makes the worker's read loop return
            proc.stdin.close()
        except OSError:
            # Pipe already broken because the worker is gone
            pass
        try:
            proc.wait(timeout=2)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        finally:
            proc.stdout.close()

    def _ensure_proc(self) -> None:
        """Start a worker subprocess unless one is already running."""
        if self._proc_alive():
            return
        # Reap an exited worker and close its pipes before replacing it
        self._discard_proc()
        log.debug('Starting STT worker subprocess')
        self._proc = subprocess.Popen(
            [sys.executable, str(Path(__file__).parent / 'stt_worker.py')],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        self._loaded = False

    def _send(self, cmd: dict) -> dict:
        """Send *cmd* to the worker and return its response dict.

        :raises RuntimeError: if the worker exited, sent an unreadable
            response, or answered with ``ok`` unset
        """
        self._ensure_proc()
        proc = self._proc
        try:
            pickle.dump(cmd, proc.stdin)
            proc.stdin.flush()
            # pickle is only used on the pipe to our own worker process
            response = pickle.load(proc.stdout)
        except (EOFError, OSError) as e:
            # OSError covers BrokenPipeError when writing to a dead worker
            self._discard_proc()
            raise RuntimeError('Worker subprocess exited unexpectedly') from e
        except pickle.UnpicklingError as e:
            # The stream is out of sync; the worker cannot be reused
            self._discard_proc()
            raise RuntimeError(
                'Worker subprocess sent an invalid response') from e

        if not response.get('ok'):
            worker_tb = response.get('traceback')
            if worker_tb:
                log.debug('STT worker traceback:\n%s', worker_tb)
            raise RuntimeError(response.get('error', 'unknown worker error'))
        return response

    def _reschedule_unload(self) -> None:
        """(Re)start the timer that unloads the worker when idle."""
        if self._unload_source is not None:
            GLib.source_remove(self._unload_source)
        self._unload_source = GLib.timeout_add_seconds(
            _IDLE_UNLOAD_SECONDS, self._on_idle_unload)

    def _on_idle_unload(self) -> bool:
        """Timer callback: unload the worker; the timer is not repeated."""
        # Clear the id first so unload_now() does not try to remove the
        # source that is currently being dispatched.
        self._unload_source = None
        log.debug('Idle unload after %ds', _IDLE_UNLOAD_SECONDS)
        self.unload_now()
        return GLib.SOURCE_REMOVE

View File

@@ -0,0 +1,54 @@
# This file is part of Gajim.
#
# Gajim is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Gajim is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import pickle
import sys
import traceback
def _respond(response: dict) -> None:
    """Pickle *response* onto the binary stdout stream and flush it."""
    stream = sys.stdout.buffer
    pickle.dump(response, stream)
    stream.flush()
def main() -> None:
    """Serve pickled commands from stdin until it reaches end of file.

    Each command is a dict with an ``op`` key:

    * ``load``: load the model named by ``model_id`` (optionally from
      ``model_path``) with ``onnx_asr.load_model``
    * ``recognize``: run the loaded model on ``audio`` and answer with
      the resulting ``text``

    Every command is answered with one pickled dict whose ``ok`` key
    tells whether it succeeded; failures carry ``error`` and, for
    exceptions, ``traceback``.
    """
    model = None
    while True:
        try:
            cmd = pickle.load(sys.stdin.buffer)
        except EOFError:
            # stdin was closed: nothing more to serve
            return

        try:
            op = cmd['op']
            if op == 'load':
                # Imported lazily, only once a model is actually requested
                import onnx_asr
                model = onnx_asr.load_model(
                    cmd['model_id'], cmd.get('model_path') or None)
                _respond({'ok': True})
            elif op == 'recognize':
                if model is None:
                    # Report this clearly instead of failing with an
                    # AttributeError on None further down.
                    _respond({'ok': False, 'error': 'no model loaded'})
                else:
                    text = model.recognize(cmd['audio'])
                    _respond({'ok': True, 'text': text})
            else:
                _respond({'ok': False, 'error': f'unknown op: {op}'})
        except Exception as e:
            # Keep the worker alive and report the failure to the caller
            _respond({
                'ok': False,
                'error': f'{type(e).__name__}: {e}',
                'traceback': traceback.format_exc(),
            })
if __name__ == '__main__':
main()

View File

@@ -13,7 +13,7 @@
"win32" "win32"
], ],
"requirements": [ "requirements": [
"gajim>=1.9.0" "gajim>=2.0.0"
], ],
"short_name": "stt_voice_messages", "short_name": "stt_voice_messages",
"version": "0.0.1" "version": "0.0.1"

View File

@@ -15,51 +15,89 @@
from __future__ import annotations from __future__ import annotations
import logging
from functools import partial from functools import partial
from pathlib import Path
from gi.repository import GLib, Gtk
from gajim.common import app
from gajim.plugins import GajimPlugin from gajim.plugins import GajimPlugin
from gajim.plugins.plugins_i18n import _ from gajim.plugins.plugins_i18n import _
from .gtk.config_dialog import * from .gtk.config_dialog import Configuration, STTVoiceMessagesConfigDialog
from .gtk.sttbox import STTBox from .gtk.sttbox import STTBox
from .models.model_settings import OnnxAsrSettings
log = logging.getLogger('gajim.p.stt_voice_messages') log = logging.getLogger('gajim.p.stt_voice_messages')
_FOCUS_LOSS_UNLOAD_SECONDS = 30
class STTVoiceMessagesPlugin(GajimPlugin): class STTVoiceMessagesPlugin(GajimPlugin):
def init(self) -> None: def init(self) -> None:
self.description = _('Transcribes voice messages to text.') self.description = _('Transcribes voice messages to text.')
self.config_default_values = {
'auto_transcribe': (False, ''),
'model_id': ('nemo-parakeet-tdt-0.6b-v3', ''),
'model_path': ('', ''),
}
self._config = Configuration(self) self._config = Configuration(self)
self._config.check_available_moduls()
self.config_dialog = partial(STTVoiceMessagesConfigDialog, self._config) self.config_dialog = partial(STTVoiceMessagesConfigDialog, self._config)
self.gui_extension_points = { self.gui_extension_points = {
'preview_audio': (self._on_preview_audio_created, None), 'preview_audio': (self._on_preview_audio_created, None),
} }
self.config_default_values = { self._active_handler_id = 0
'auto_transcribe': (False, ''), self._focus_unload_source = None
'model': ('', ''),
'model_class': (None, ''),
'whisperai_model_size': ('tiny', ''),
'whisperai_translate': (False, ''),
}
self._audio_file: str = '' def activate(self) -> None:
self._preview_audio_widget = None if app.window is not None and self._active_handler_id == 0:
self._stt_box = None self._active_handler_id = app.window.connect(
'notify::is-active', self._on_window_active_changed)
def deactivate(self) -> None:
    """Undo ``activate``: drop the timer, the signal handler and the model."""
    if self._focus_unload_source is not None:
        GLib.source_remove(self._focus_unload_source)
        self._focus_unload_source = None

    if self._active_handler_id != 0:
        if app.window is not None:
            app.window.disconnect(self._active_handler_id)
        # Reset even when the window is already gone; activate() only
        # connects again while this id is 0.
        self._active_handler_id = 0

    if self._config.is_available:
        self._config.unload_model()
def _on_window_active_changed(self,
                              window: Gtk.Window,
                              _pspec: object,
                              ) -> None:
    """Start or cancel the focus-loss unload timer.

    When *window* is not active and no timer is pending, a timer of
    ``_FOCUS_LOSS_UNLOAD_SECONDS`` is started. When *window* is active,
    a pending timer is removed.
    """
    active = window.is_active()
    pending = self._focus_unload_source

    if not active:
        if pending is None:
            self._focus_unload_source = GLib.timeout_add_seconds(
                _FOCUS_LOSS_UNLOAD_SECONDS, self._on_focus_unload_fired)
        return

    if pending is not None:
        GLib.source_remove(pending)
        self._focus_unload_source = None
def _on_focus_unload_fired(self) -> bool:
    """Timer callback: unload the model if one is available; run once."""
    # Forget the source id first; returning SOURCE_REMOVE ends this timer
    self._focus_unload_source = None
    config = self._config
    if config.is_available:
        config.unload_model()
    return GLib.SOURCE_REMOVE
def _on_preview_audio_created(self, def _on_preview_audio_created(self,
preview_audio_widget: Gtk.Box, drawing_box: Gtk.Box,
control_box: Gtk.Box,
audio_file: Path audio_file: Path
) -> None: ) -> None:
self._preview_audio_widget = preview_audio_widget self._drawing_box = drawing_box;
self._control_box = control_box;
self._audio_file = audio_file.as_posix() self._audio_file = audio_file.as_posix()
self._create_stt_box() self._create_stt_box()
def _create_stt_box(self) -> None: def _create_stt_box(self) -> None:
assert self._preview_audio_widget is not None self._stt_box = STTBox(self._config, self._audio_file)
self._stt_box = STTBox(self._preview_audio_widget, self._control_box.append(self._stt_box.button)
self.config, self._drawing_box.append(self._stt_box)
self._audio_file)
self._preview_audio_widget.pack_end(self._stt_box, False, False, 0)