Compare commits

...

10 Commits

Author SHA1 Message Date
root
db76ac34f0 WIP: parakeet 2026-05-18 23:07:34 -03:00
mesonium
2e4aeb3b6f Update README 2026-05-05 05:23:23 -03:00
mesonium
e1dd4dd9fe Update README 2026-05-05 05:23:23 -03:00
mesonium
2dae45d2aa Update README 2026-05-05 05:23:23 -03:00
mesonium
24644f7fd4 Hide settings if model is not available 2026-05-05 05:23:23 -03:00
mesonium
2d7630a757 Make Faster Whisper and OpenAI Whisper work 2026-05-05 05:23:23 -03:00
mesonium
b0adecef7a Improve multi modul support and refactor 2026-05-05 05:23:23 -03:00
mesonium
aff69e5b40 Add TODOs 2026-05-05 05:23:23 -03:00
mesonium
18c4fe9361 Follow up from previous two commits 2026-05-05 05:23:23 -03:00
mesonium
1889be0323 Move model_settings into model folder 2026-05-05 05:23:23 -03:00
14 changed files with 677 additions and 539 deletions

View File

@@ -1,28 +1,49 @@
# Requirements
# About
## STT Models
This plugin, in conjunction with a _general-purpose speech recognition model_, allows you to transcribe your voice messages to text.
### openai-whisper https://github.com/openai/whisper
In order to make use of this plugin, you need to have at least one of the following models installed:
#### Installation
`pip install -U openai-whisper` will install
#### OpenAI Whisper
- Website: https://github.com/openai/whisper
- Installable by: `pip install -U openai-whisper`
```
mpmath, urllib3, tqdm, sympy, regex, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12,
nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12,
nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, networkx,
MarkupSafe, llvmlite, fsspec, filelock, charset-normalizer, certifi, triton,
requests, nvidia-cusparse-cu12, nvidia-cudnn-cu12, numba, jinja2, tiktoken,
nvidia-cusolver-cu12, torch, openai-whisper
```
#### Faster Whisper
- Website: https://github.com/SYSTRAN/faster-whisper
- Installable by: `pip install -U faster-whisper`
#### Models
Additionally, you have to check out the following Gajim branch:
https://dev.gajim.org/mesonium/gajim/-/tree/stt_voice_messages
| Multilingual Model | Download Size | VRAM Requirement | Relative Speed |
|---------------------|---------------| ---------------- |----------------|
| Tiny | 70 MB | ~1 GB | ~32x |
| Base | 140 MB | ~1 GB | ~16x |
| Small | 460 MB | ~2 GB | ~6x |
| Medium | 1.4 GB | ~5 GB | ~2x |
| Large | 2.9 GB | ~10 GB | ~1x |
# Hint
_**The plugin is very much POC at this stage!**_
Currently, a chosen model is downloaded in the background on first use, during which
Gajim's UI may not respond.
Typical model sizes are in case of OpenAI Whisper:
| Multilingual Model | Download Size |
|---------------------|---------------|
| Tiny | 70 MB |
| Base | 140 MB |
| Small | 460 MB |
| Medium | 1.4 GB |
| Large | 2.9 GB |
# TODO
- [x] Offer multiple models
- [ ] Add various model settings
- [ ] Model receiving
- [ ] Hint model download state
- [ ] Allow to change model download location
- [ ] Allow to use local models
- [ ] Database Handling
- [ ] Store transcribed messages in a DB
- [ ] Option to delete DB
- [ ] Update UI
- [ ] Make it more pretty
- [ ] Show progress bar
- [ ] Highlight words on playback

View File

@@ -18,22 +18,23 @@ from __future__ import annotations
import logging
import typing
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
import whisper
from gi.repository import Gtk
try:
import onnx_asr
except ModuleNotFoundError:
if typing.TYPE_CHECKING:
import onnx_asr
from gi.repository import Adw, Gtk
from gajim.common import app
from gajim.common.app import Any
from gajim.gtk.builder import get_builder
from gajim.gtk.const import Setting, SettingKind, SettingType
from gajim.gtk.settings import SettingsBox
from gajim.gtk.sidebar_switcher import SideBarSwitcher
from gajim.plugins.helpers import get_builder
from gajim.gtk.filechoosers import Filter
from gajim.gtk.settings import GajimPreferencesGroup, SettingsDialog
from gajim.plugins.plugins_i18n import _
from ..model_settings import *
from ..models import openai_whisper
from ..models import stt
from ..models.model_settings import OnnxAsrSettings
if TYPE_CHECKING:
from ..stt_voice_messages import STTVoiceMessagesPlugin
@@ -41,210 +42,250 @@ if TYPE_CHECKING:
log = logging.getLogger('gajim.p.sttvm_config_dialog')
SUPPORTED_MODELS: dict[str, dict[str, typing.Union[list[str], Any, str]]] = {
'model_openaiwhisper': {
'moduls': ['whisper'],
'class': openai_whisper.WhisperModel,
'name': 'OpenAI Whisper'
},
'model_ctranslate2': {
'moduls': ['ctranslate2'],
'class': None,
'name': _('CTranslate2')
},
'model_faster-whisper': {
'moduls': ['faster-whisper'],
'class': None,
'name:': _('Faster-Whisper')
},
'model_distill': {
'moduls': ['transformers', 'accelerate', 'datasets[audio]'],
'class': None,
'name': _('Distill')
}
}
class Configuration:
def __init__(self, plugin: STTVoiceMessagesPlugin):
self._plugin = plugin
self._openaiwhisper_settings = OpenAIWhisperSettings()
self._available_models: dict[
str, dict[str, typing.Union[list[str], Any, str]]] = {}
self.check_available_moduls()
self._instance = None
self._main_model_row = None
self._preset_model_picker = None
self._custom_model_id_entry = None
self._local_model_file_picker = None
self._status_group = None
self._model_data: dict[str, str] = {}
self._instance = stt.OnnxAsrModel()
self._instance.set_config(OnnxAsrSettings(
model_id=self.plugin.config['model_id'],
model_path=self.plugin.config['model_path']
))
self._model_data = self._steal_model_list()
@property
def plugin(self) -> STTVoiceMessagesPlugin:
return self._plugin
@property
def available_models(self) -> dict[str, dict[str, typing.Union[list[str], Any, str]]]:
return self._available_models
def is_available(self) -> bool:
return self._instance is not None
def unload_model(self) -> None:
if self._instance is not None:
self._instance.unload_now()
def _steal_model_list(self) -> dict[str, str]:
# UGLY: Extract available model choices from onnx_asr type hints.
ann = onnx_asr.load_model.__annotations__.get('model')
return {
v: v for arg in typing.get_args(ann)
for v in typing.get_args(arg)
if isinstance(v, str)
}
def on_setting(self, value: Any, data: Any) -> None:
if isinstance(value, str):
value.strip()
log.debug('plugin config before:\n %s', self.plugin.config.data)
value = value.strip()
self.plugin.config[data] = value
self._plugin.config['model_instance'].on_setting(data, value)
log.debug('plugin config after:\n %s', self.plugin.config.data)
def on_set_model(self, value: Any, data: Any) -> None:
if isinstance(value, str):
value.strip()
log.debug('plugin config before:\n %s', self.plugin.config.data)
def on_preset_changed(self, value: str, data: Any) -> None:
if self._custom_model_id_entry is not None:
entry_text = self._custom_model_id_entry.entry.get_text().strip()
if entry_text:
self._update_model_status()
return # custom entry overrides; ignore preset change
self._write_model_id(value)
self._update_model_status()
self._available_models[value]['model_instance'] = self._available_models[value]['class']()
def on_custom_model_id_changed(self, value: str, data: Any) -> None:
value = value.strip()
if value:
self._write_model_id(value)
elif self._preset_model_picker is not None:
preset_key = self._preset_model_picker._dropdown.get_selected_key()
if preset_key is not None:
self._write_model_id(preset_key)
self._apply_sensitivity_state()
self._update_model_status()
self.plugin.config['model_class'] = self._available_models[value][
'class']
self.plugin.config['model_instance'] = self._available_models[value]['model_instance']
def on_model_file_picked(self, value: str, data: Any) -> None:
self._write_model_path(str(Path(value).parent) if value else '')
self._apply_sensitivity_state()
self._update_model_status()
self.on_setting(value, data)
log.debug('plugin config after:\n %s', self.plugin.config.data)
def _write_model_id(self, model_id: str) -> None:
if self.plugin.config['model_id'] == model_id:
return
self.plugin.config['model_id'] = model_id
if self._instance is not None:
self._instance.set_config(OnnxAsrSettings(
model_id=self.plugin.config['model_id'],
model_path=self.plugin.config['model_path']
))
@staticmethod
def is_module_available(module: str) -> bool:
try:
__import__(module)
return True
except ModuleNotFoundError:
log.debug('Could not find module %s', module)
return False
except ImportError as ex:
log.debug(str(ex))
return False
def _write_model_path(self, model_path: str) -> None:
if self.plugin.config['model_path'] == model_path:
return
self.plugin.config['model_path'] = model_path
if self._instance is not None:
self._instance.set_config(OnnxAsrSettings(
model_id=self.plugin.config['model_id'],
model_path=self.plugin.config['model_path']
))
def check_available_moduls(self):
for model in SUPPORTED_MODELS:
available = True
for modul in SUPPORTED_MODELS[model]['moduls']:
if not self.is_module_available(modul):
available = False
continue
if available:
self._available_models[model] = SUPPORTED_MODELS[model]
def sync_model_path_from_widget(self) -> None:
if self._local_model_file_picker is None:
return
button = self._local_model_file_picker.get_activatable_widget()
path = button.get_path()
new_path = str(path.parent) if path else ''
self._write_model_path(new_path)
if (self.plugin.config.data['model_class'] is None
and len(self._available_models) > 0):
model = list(self._available_models)[0]
self.on_set_model(model, 'model')
log.debug('Choose first available model!')
def _apply_sensitivity_state(self) -> None:
    """Enable or disable the model pickers according to the current choice.

    Does nothing while no preset picker is attached. Otherwise:

    * the custom model entry is sensitive only when no local model path
      is configured;
    * the preset picker is sensitive only when neither a local model path
      nor a non-empty custom entry is present.
    """
    if self._preset_model_picker is None:
        return

    has_local = bool(self.plugin.config['model_path'])

    # The entry row is read and updated under a single None guard.
    # Previously only the text lookup was guarded, so set_sensitive()
    # would raise AttributeError if the row was not attached.
    entry_row = self._custom_model_id_entry
    has_entry = False
    if entry_row is not None:
        has_entry = bool(entry_row.entry.get_text().strip())
        entry_row.set_sensitive(not has_local)

    self._preset_model_picker.set_sensitive(not has_local and not has_entry)
def _update_model_status(self) -> None:
if self._main_model_row is None:
return
entry_text = (self._custom_model_id_entry.entry.get_text().strip()
if self._custom_model_id_entry else '')
if self.plugin.config['model_path']:
path = Path(self.plugin.config['model_path'])
summary = _('Local: {}').format(path.name or str(path))
description = _('Loading model files from {}').format(path)
if not (path / 'config.json').exists():
description += '\n' + _(
'config.json not found in this directory — onnx-asr will'
' fall back to Model preset or Custom Model ID for the'
' architecture.')
elif entry_text:
summary = _('Custom: {}').format(entry_text)
description = _('Using custom model: {}').format(entry_text)
else:
log.debug('Available model already chosen!')
preset_key = (self._preset_model_picker._dropdown.get_selected_key()
if self._preset_model_picker else '')
summary = preset_key or _('(none)')
description = (_('Using preset: {}').format(preset_key)
if preset_key else '')
log.debug('models = %s', self._available_models)
self._main_model_row._label.set_text(summary)
if self._status_group is not None:
self._status_group.set_description(description)
class PreferenceBox(SettingsBox):
def __init__(self, settings: list[Setting]) -> None:
SettingsBox.__init__(self, None)
self.get_style_context().add_class('border')
self.set_selection_mode(Gtk.SelectionMode.NONE)
self.set_vexpand(False)
self.set_valign(Gtk.Align.END)
for setting in settings:
self.add_setting(setting)
self.update_states()
class STTVoiceMessagesConfigDialog(Gtk.ApplicationWindow):
class STTVoiceMessagesConfigDialog(SettingsDialog):
def __init__(self, config: Configuration, parent: Gtk.Window) -> None:
Gtk.ApplicationWindow.__init__(self)
self.set_application(app.app)
self.set_position(Gtk.WindowPosition.CENTER)
self.set_show_menubar(False)
self.set_name('PreferencesWindow')
self.set_default_size(900, 650)
self.set_resizable(True)
self.set_title(_('STT Voice Messages - Preferences'))
ui_path = Path(__file__).parent
self._ui = get_builder(str(ui_path.resolve() / 'config_dialog.ui'))
self._prefs: dict[str, PreferenceBox] = {}
side_bar_switcher = SideBarSwitcher()
side_bar_switcher.set_stack(self._ui.stack)
self._ui.grid.attach(side_bar_switcher, 0, 0, 1, 1)
self.add(self._ui.grid)
self.config = config
self.plugin = self.config.plugin
if not config.is_available:
return
prefs: list[tuple[str, type[PreferenceBox]]] = [
('stt_behaviour', self.STTBehaviour),
('models', self.Models),
('whisper_general', self.OpenAIWhisperGeneral),
rows = [
Setting(SettingKind.SWITCH,
_('Auto Transcribe'),
SettingType.VALUE,
value=self.plugin.config['auto_transcribe'],
data='auto_transcribe',
callback=config.on_setting,
desc=_('Transcribe messages as they appear')),
Setting(SettingKind.SUBPAGE,
_('Model'),
SettingType.VALUE,
value=None,
name='main_model',
props={'subpage': 'sttvm-model'}),
]
self._add_prefs(prefs)
self.show_all()
SettingsDialog.__init__(
self,
parent,
_('STT Voice Messages'),
Gtk.DialogFlags.MODAL,
rows,
'',
)
class STTBehaviour(PreferenceBox):
def __init__(self, config_dialog: STTVoiceMessagesConfigDialog) -> None:
config._main_model_row = self.get_setting('main_model')
settings = [
Setting(SettingKind.SWITCH,
_('Auto Transcribe'),
SettingType.VALUE,
value=config_dialog.plugin.config['auto_transcribe'],
data='auto_transcribe',
callback=config_dialog.config.on_setting)
]
PreferenceBox.__init__(self, settings)
class Models(PreferenceBox):
def __init__(self, config_dialog: STTVoiceMessagesConfigDialog) -> None:
models: list[tuple[str, str]] = []
for key, value in config_dialog.config.available_models.items():
models.append(
(key, str(value['name']))
)
settings = [
Setting(SettingKind.COMBO,
_('Speech To Text Model'),
SettingType.VALUE,
value=config_dialog.plugin.config['model'],
data='model',
callback=config_dialog.config.on_set_model,
props={'combo_items': models},
desc=_('Choose Model to use')),
]
PreferenceBox.__init__(self, settings)
class OpenAIWhisperGeneral(PreferenceBox):
def __init__(self, config_dialog: STTVoiceMessagesConfigDialog) -> None:
settings = [
Setting(SettingKind.POPOVER,
_('Language Model Size'),
SettingType.VALUE,
value=config_dialog.plugin.config['whisperai_model_size'],
data='whisperai_model_size',
callback=config_dialog.config.on_setting,
props={'entries': whisper.available_models()}),
Setting(SettingKind.SWITCH,
_('Translate'),
SettingType.VALUE,
value=config_dialog.plugin.config['whisperai_translate'],
data='whisperai_translate',
callback=config_dialog.config.on_setting)
]
PreferenceBox.__init__(self, settings)
def _add_prefs(self, prefs: list[tuple[str, type[PreferenceBox]]]):
for ui_name, klass in prefs:
pref_box = getattr(self._ui, ui_name)
pref = klass(self) # pyright: ignore
pref_box.add(pref)
self._prefs[ui_name] = pref
use_custom = self.plugin.config['model_id'] not in config._model_data
subpage_rows: list[Setting] = [
Setting(SettingKind.DROPDOWN,
_('Model'),
SettingType.VALUE,
value=self.plugin.config['model_id'],
name='preset_model',
callback=config.on_preset_changed,
props={'data': config._model_data}),
Setting(SettingKind.ENTRY,
_('Custom Model'),
SettingType.VALUE,
value=self.plugin.config['model_id'] if use_custom else '',
name='custom_model',
callback=config.on_custom_model_id_changed,
desc=_('Custom HF model path or model ID')),
Setting(SettingKind.FILECHOOSER,
_('Local File'),
SettingType.VALUE,
value='',
name='local_model_file',
callback=config.on_model_file_picked,
desc=_('Model ID is taken from config.json if not set'),
props={'filefilters': [
Filter(_('ONNX model'), suffixes=['onnx'], default=True),
]}),
]
controls_group = GajimPreferencesGroup('model_controls')
for s in subpage_rows:
controls_group.add_setting(s)
status_group = Adw.PreferencesGroup()
pref_page = Adw.PreferencesPage()
pref_page.add(controls_group)
pref_page.add(status_group)
toolbar = Adw.ToolbarView(content=pref_page)
toolbar.add_top_bar(Adw.HeaderBar())
page = Adw.NavigationPage(
tag='sttvm-model', title=_('Model'), child=toolbar)
self._nav.add(page)
config._preset_model_picker = controls_group.get_setting('preset_model')
config._custom_model_id_entry = controls_group.get_setting('custom_model')
config._local_model_file_picker = controls_group.get_setting(
'local_model_file')
config._status_group = status_group
config._custom_model_id_entry.entry.set_placeholder_text(
_('onnx-community/whisper-large-v3-turbo'))
button = config._local_model_file_picker.get_activatable_widget()
button._label_text = _('.oonx')
button.reset()
if self.plugin.config['model_path']:
onnx_in_dir = next(iter(Path(self.plugin.config['model_path']).glob('*.onnx')),
None)
if onnx_in_dir is not None:
button.set_path(onnx_in_dir)
config._update_model_status()
config._apply_sensitivity_state()
def _cleanup(self) -> None:
self.config.sync_model_path_from_widget()
self.config._main_model_row = None
self.config._preset_model_picker = None
self.config._custom_model_id_entry = None
self.config._local_model_file_picker = None
self.config._status_group = None
SettingsDialog._cleanup(self)

View File

@@ -1,271 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Generated with glade 3.40.0 -->
<interface>
<requires lib="gtk+" version="3.20"/>
<!-- n-columns=3 n-rows=3 -->
<object class="GtkGrid" id="grid">
<property name="visible">True</property>
<property name="can-focus">False</property>
<child>
<object class="GtkStack" id="stack">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="hexpand">True</property>
<child>
<object class="GtkScrolledWindow">
<property name="visible">True</property>
<property name="can-focus">True</property>
<property name="hscrollbar-policy">never</property>
<property name="shadow-type">in</property>
<property name="overlay-scrolling">False</property>
<child>
<object class="GtkViewport">
<property name="visible">True</property>
<property name="can-focus">False</property>
<child>
<object class="GtkBox">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="spacing">24</property>
<child>
<!-- n-columns=1 n-rows=1 -->
<object class="GtkGrid" id="stt_behaviour">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="row-spacing">12</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">Behaviour of STT Voice Messages</property>
<property name="xalign">0</property>
<style>
<class name="bold"/>
</style>
</object>
<packing>
<property name="left-attach">0</property>
<property name="top-attach">0</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">0</property>
</packing>
</child>
<child>
<!-- n-columns=1 n-rows=1 -->
<object class="GtkGrid" id="models">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="row-spacing">12</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">General Model Configuration</property>
<property name="xalign">0</property>
<style>
<class name="bold"/>
</style>
</object>
<packing>
<property name="left-attach">0</property>
<property name="top-attach">0</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">1</property>
</packing>
</child>
<child>
<!-- n-columns=1 n-rows=1 -->
<object class="GtkGrid" id="file_preview">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="row-spacing">12</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">Preview UI</property>
<property name="xalign">0</property>
<style>
<class name="bold"/>
</style>
</object>
<packing>
<property name="left-attach">0</property>
<property name="top-attach">0</property>
</packing>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">2</property>
</packing>
</child>
</object>
</child>
</object>
</child>
</object>
<packing>
<property name="name">general</property>
<property name="title" translatable="yes">General</property>
<property name="icon-name">computer-symbolic</property>
</packing>
</child>
<child>
<object class="GtkScrolledWindow">
<property name="visible">True</property>
<property name="can-focus">True</property>
<property name="hscrollbar-policy">never</property>
<property name="shadow-type">in</property>
<property name="overlay-scrolling">False</property>
<child>
<object class="GtkViewport">
<property name="visible">True</property>
<property name="can-focus">False</property>
<child>
<object class="GtkBox">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="spacing">24</property>
<child>
<!-- n-columns=3 n-rows=3 -->
<object class="GtkGrid" id="whisper_general">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="orientation">vertical</property>
<property name="row-spacing">12</property>
<child>
<object class="GtkLabel">
<property name="visible">True</property>
<property name="can-focus">False</property>
<property name="label" translatable="yes">General</property>
<property name="xalign">0</property>
<style>
<class name="bold"/>
</style>
</object>
<packing>
<property name="left-attach">0</property>
<property name="top-attach">0</property>
</packing>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
</object>
<packing>
<property name="expand">False</property>
<property name="fill">True</property>
<property name="position">0</property>
</packing>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
</object>
</child>
</object>
</child>
</object>
<packing>
<property name="name">openai-whisper</property>
<property name="title" translatable="yes">openAI Whisper</property>
<property name="position">1</property>
</packing>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<style>
<class name="settings-stack"/>
</style>
</object>
<packing>
<property name="left-attach">1</property>
<property name="top-attach">0</property>
</packing>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
<child>
<placeholder/>
</child>
</object>
</interface>

View File

@@ -13,9 +13,12 @@
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import logging
from __future__ import annotations
from gi.repository import Gtk
import logging
from pathlib import Path
from gi.repository import Gtk, Adw
from gajim.plugins.gajimplugin import GajimPluginConfig
from gajim.plugins.plugins_i18n import _
@@ -26,48 +29,62 @@ log = logging.getLogger('gajim.p.stt_voice_messages_sttbox')
class STTBox(Gtk.Box):
def __init__(self,
preview_audio_widget: Gtk.Box,
config: GajimPluginConfig,
audio_file: str,
audio_file: Path,
) -> None:
Gtk.Box.__init__(self, orientation=Gtk.Orientation.VERTICAL, spacing=12)
Gtk.Box.__init__(self, orientation=Gtk.Orientation.HORIZONTAL, spacing=6)
self._config = config
self._preview_audio = preview_audio_widget
self._model = None
self._audio_file = audio_file
self._text = ''
self._transcribe_button = Gtk.Button(label=_('Transcribe'))
self._transcribe_button = Gtk.Button.new_from_icon_name("lucide-captions-symbolic")
self._transcribe_button.set_tooltip_text(_('Transcribe voice message'))
self._spinner = Adw.Spinner(valign=Gtk.Align.START, visible=False)
self._transcription_label = Gtk.Label(
label=_('Nothing transcribed yet'))
self._transcription_label.set_max_width_chars(40)
self._transcription_label.set_line_wrap(True)
self.add(self._transcribe_button)
self.add(self._transcription_label)
self._transcription_label.set_wrap(True)
self.append(self._spinner)
self.append(self._transcription_label)
self._transcribe_button.connect('clicked', self._on_transcribe_clicked)
self._result = helper.Results('')
self._transcribe_button.connect('clicked', self._on_transcribe_clicked)
self.show_all()
@property
def button(self) -> Gtk.Button:
return self._transcribe_button
def _on_transcribe_clicked(self, _button: Gtk.Button) -> None:
log.debug('config.data = %s', self._config.data)
model = self._config.data['model_instance']
if model is None:
log.debug('config._instance = %s', self._config._instance)
self._model = self._config._instance
if self._model is None:
return
self._model = model
if self._model.is_loaded:
text = _('Transcribing…')
elif self._model.will_download:
text = _('Downloading ') + self._model.model_id
else:
text = _('Loading model…')
self._transcription_label.set_text(text)
self._spinner.set_visible(True)
self._task = helper.BackgroundTask(
self._model.load, self._on_load_done)
self._task.start()
transcription_task = helper.BackgroundTask(
self._model.transcribe(self._result, self._audio_file),
self._show_result
def _on_load_done(self):
self._transcription_label.set_text(_('Transcribing…'))
self._task = helper.BackgroundTask(
lambda: self._model.recognize(
self._result, helper.load_audio(self._audio_file)),
self._show_result,
)
transcription_task.start()
self._task.start()
def _show_result(self):
assert self._model is not None
@@ -76,3 +93,4 @@ class STTBox(Gtk.Box):
self._transcription_label.set_text(self._text.strip())
else:
self._transcription_label.set_text(_('_Have not heard any word!_'))
self._spinner.set_visible(False)

View File

@@ -13,16 +13,53 @@
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import logging
import typing
from dataclasses import dataclass
from pathlib import Path
import gi
import numpy as np
from gi.repository import Gio, GObject
try:
gi.require_version('Gst', '1.0')
from gi.repository import Gst
except Exception:
if typing.TYPE_CHECKING:
from gi.repository import Gst
log = logging.getLogger('gajim.p.sttvm_helper')
@dataclass
class Results:
text: str
def load_audio(path: Path, sample_rate: int = 16000) -> np.ndarray:
    """Decode an audio file into a one-dimensional float32 sample array.

    The file is run through a GStreamer pipeline that decodes it, converts
    it and resamples it to single-channel F32LE audio at ``sample_rate`` Hz;
    the resulting buffers are pulled from an appsink and concatenated.

    :param path: Location of the audio file to decode.
    :param sample_rate: Target sample rate in Hz requested in the caps.
    :returns: All decoded samples as one ``np.float32`` array.
    :raises RuntimeError: If a buffer cannot be mapped for reading or no
        samples were produced at all.
    """
    Gst.init(None)
    pipeline = Gst.parse_launch(
        'filesrc name=src ! decodebin ! audioconvert ! audioresample ! '
        f'audio/x-raw,format=F32LE,rate={sample_rate},channels=1 ! '
        'appsink name=sink sync=false'
    )
    pipeline.get_by_name('src').set_property('location', str(path))
    sink = pipeline.get_by_name('sink')

    chunks: list[np.ndarray] = []
    pipeline.set_state(Gst.State.PLAYING)
    try:
        # The loop ends when try-pull-sample yields None; per the appsink
        # documentation that is end-of-stream or the 10 s timeout.
        while (sample := sink.emit(
                'try-pull-sample', 10 * Gst.SECOND)) is not None:
            buf = sample.get_buffer()
            mapped, info = buf.map(Gst.MapFlags.READ)
            if not mapped:
                raise RuntimeError(f'Could not map audio buffer: {path}')
            try:
                # bytes() copies the data so the array stays valid after
                # the buffer has been unmapped.
                chunks.append(
                    np.frombuffer(bytes(info.data), dtype=np.float32))
            finally:
                buf.unmap(info)
    finally:
        # Always tear the pipeline down, even when decoding fails midway.
        pipeline.set_state(Gst.State.NULL)

    if not chunks:
        raise RuntimeError(f'Could not decode audio: {path}')
    return np.concatenate(chunks)
'''
https://discourse.gnome.org/t/gtk-threading-problem-with-glib-idle-add/13597/5
@@ -57,6 +94,7 @@ class BackgroundTask(GObject.Object):
retval = self.function()
task.return_value(retval)
except Exception as e:
log.exception('Background task failed')
task.return_value(e)
def finish(self):

View File

@@ -18,7 +18,7 @@ from pathlib import Path
from gajim.gtk.const import Setting
from .model import Model
from .model_template import Model
try:
import ctranslate2

View File

@@ -0,0 +1,60 @@
# This file is part of Gajim.
#
# Gajim is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Gajim is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import logging
import typing
from dataclasses import dataclass
from pathlib import Path
from ..helper import Results
from .model_settings import FasterWhisperSettings
from .model_template import Model
log = logging.getLogger('gajim.p.sttvm_faster_whisper')
try:
import faster_whisper
except ModuleNotFoundError:
if typing.TYPE_CHECKING:
import faster_whisper
@dataclass
class Configuration:
model_size: str
class FasterWhisperModel(Model):
    """Speech-to-text model backed by the ``faster_whisper`` package."""

    def __init__(self):
        self._result: str = ''
        self._config = FasterWhisperSettings()

    @property
    def result(self) -> str:
        """Return the stored result string (initialised to ``''``)."""
        return self._result

    def transcribe(self, result: Results, audio_file: Path) -> None:
        """Transcribe ``audio_file`` and store the text in ``result.text``.

        A ``faster_whisper.WhisperModel`` is created for the configured
        model size on every call.

        :param result: Container whose ``text`` attribute is overwritten
            with the concatenated text of all segments.
        :param audio_file: Path of the audio file to transcribe.
        """
        model = faster_whisper.WhisperModel(
            self._config.model_size, compute_type="float32")
        log.debug('model size is used = %s', self._config.model_size)

        # Pass a plain string path rather than a Path object.
        # NOTE(review): faster-whisper documents str / file-like / ndarray
        # inputs - confirm against the installed version.
        segments, _info = model.transcribe(str(audio_file))
        segments = list(segments)
        log.debug('segments = %s', segments)

        # Per-segment timing goes to the debug log instead of stdout.
        for segment in segments:
            log.debug('[%.2fs -> %.2fs] %s',
                      segment.start, segment.end, segment.text)

        result.text = ''.join(segment.text for segment in segments)

    def set_config(self, config: FasterWhisperSettings) -> None:
        """Replace the settings used by subsequent transcriptions."""
        self._config = config

View File

@@ -18,6 +18,6 @@ from dataclasses import dataclass, field
@dataclass
class OpenAIWhisperSettings:
whisperai_model_size: str = field(default='tiny', init=True)
class OnnxAsrSettings:
model_id: str = field(default='nemo-parakeet-tdt-0.6b-v3', init=True)
model_path: str = ''

View File

@@ -14,19 +14,28 @@
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any
from gajim.gtk.const import Setting
import numpy as np
from ..helper import Results
class Model(ABC):
@property
@abstractmethod
def transcribe(self, result: Results, audio_file: Path) -> str:
return ''
def is_loaded(self) -> bool:
pass
@abstractmethod
def on_setting(self, setting: Setting):
pass
def load(self) -> None:
pass
@abstractmethod
def recognize(self, result: Results, audio: np.ndarray) -> None:
pass
@abstractmethod
def set_config(self, config: Any) -> None:
pass

View File

@@ -19,8 +19,8 @@ from dataclasses import dataclass
from pathlib import Path
from ..helper import Results
from ..model_settings import OpenAIWhisperSettings
from .model import Model
from .model_settings import OpenAIWhisperSettings
from .model_template import Model
log = logging.getLogger('gajim.p.sttvm_whisper')
@@ -37,7 +37,6 @@ class Configuration:
class WhisperModel(Model):
def __init__(self):
# TODO
self._result: str = ''
self._config = OpenAIWhisperSettings()
@@ -45,12 +44,11 @@ class WhisperModel(Model):
def result(self) -> str:
return self._result
def transcribe(self, result: Results, audio_file: Path) -> str:
model = whisper.load_model(self._config['whisperai_model_size'])
log.debug('model size is used = %s', self._config['whisperai_model_size'])
result.text = model.transcribe(audio_file)['text']
def transcribe(self, result: Results, audio_file: Path) -> None:
model = whisper.load_model(self._config.model_size)
log.debug('model size is used = %s', self._config.model_size)
result.text = model.transcribe(audio_file)['text'] # pyright: ignore [reportAttributeAccessIssue]
def on_setting(self, key, value):
log.debug('key = %s, value = %s', key, value)
self._config[key] = value
def set_config(self, config: OpenAIWhisperSettings) -> None:
self._config = config

View File

@@ -0,0 +1,132 @@
# This file is part of Gajim.
#
# Gajim is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Gajim is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import logging
import pickle
import subprocess
import sys
from pathlib import Path
import numpy as np
from gi.repository import GLib
from ..helper import Results
from .model_settings import OnnxAsrSettings
from .model_template import Model
log = logging.getLogger('gajim.p.sttvm_onnx_asr')
# Seconds without a load()/recognize() call after which the idle timer fires
# and the worker subprocess is shut down.
_IDLE_UNLOAD_SECONDS = 300
class OnnxAsrModel(Model):
    """Speech-to-text backend that runs ``onnx_asr`` in a worker subprocess.

    Commands and responses are exchanged as pickled dicts over the worker's
    stdin/stdout pipes. The worker is shut down by ``unload_now()``, which
    also runs automatically once ``_IDLE_UNLOAD_SECONDS`` pass without a
    ``load()`` or ``recognize()`` call.
    """

    def __init__(self):
        # Worker subprocess (subprocess.Popen), or None while none exists.
        self._proc = None
        # True once the current worker has acknowledged a 'load' command.
        self._loaded = False
        self._config = OnnxAsrSettings()
        # GLib source id of the pending idle-unload timeout, or None.
        self._unload_source = None

    @property
    def is_loaded(self) -> bool:
        """Whether this object currently considers the model loaded."""
        return self._loaded

    @property
    def will_download(self) -> bool:
        """Whether the model's ``config.json`` is missing from the HF cache.

        Returns ``False`` when the model is already loaded, when an explicit
        ``model_path`` is configured, or when the resolved repo name contains
        no ``/``. Presumably ``True`` means ``load()`` has to download the
        model first -- TODO confirm against ``onnx_asr``.
        """
        if self.is_loaded or self._config.model_path:
            return False

        # Imported lazily, only when this property is evaluated.
        from huggingface_hub import try_to_load_from_cache
        from onnx_asr.resolver import model_repos

        repo = model_repos.get(self._config.model_id, self._config.model_id)
        if '/' not in repo:
            return False
        return not isinstance(try_to_load_from_cache(repo, 'config.json'), str)

    def load(self) -> None:
        """Make sure a live worker has the configured model loaded.

        Raises ``RuntimeError`` if the worker dies or reports an error.
        """
        # The flag alone is not enough: if the worker died since the last
        # call, _send() would silently start a fresh, model-less worker and
        # the following 'recognize' would fail.
        if self._loaded and self._worker_alive():
            self._reschedule_unload()
            return

        log.debug('Loading model %s in worker', self._config.model_id)
        self._send({
            'op': 'load',
            'model_id': self._config.model_id,
            'model_path': self._config.model_path,
        })
        self._loaded = True
        self._reschedule_unload()

    def recognize(self, result: Results, audio: np.ndarray) -> None:
        """Transcribe *audio* in the worker and store the text in *result*.

        Loads the model first if needed. Raises ``RuntimeError`` if the
        worker dies or reports an error.
        """
        self.load()
        response = self._send({'op': 'recognize', 'audio': audio})
        result.text = response['text']
        self._reschedule_unload()

    def set_config(self, config: OnnxAsrSettings) -> None:
        """Adopt a copy of *config*; unload first if the model changed."""
        if (config.model_id != self._config.model_id
                or config.model_path != self._config.model_path):
            self.unload_now()
        self._config = OnnxAsrSettings(
            model_id=config.model_id, model_path=config.model_path)

    def unload_now(self) -> None:
        """Cancel the idle timer and shut the worker subprocess down."""
        if self._unload_source is not None:
            GLib.source_remove(self._unload_source)
            self._unload_source = None

        if self._proc is not None:
            log.debug('Terminating STT worker subprocess')
        self._shutdown_proc(timeout=2)

    def _worker_alive(self) -> bool:
        """Return True if a worker exists and has not exited yet."""
        return self._proc is not None and self._proc.poll() is None

    def _shutdown_proc(self, timeout: float) -> None:
        """Stop the worker (if any), reap it and close both pipes.

        Closing stdin makes the worker's command loop see EOF and return;
        if it has not exited after *timeout* seconds it is killed.
        """
        proc, self._proc = self._proc, None
        self._loaded = False
        if proc is None:
            return

        try:
            proc.stdin.close()
        except OSError:
            # Best effort: the worker is already gone, so flushing buffered
            # data on close hit a broken pipe. Cleanup continues below.
            pass

        try:
            proc.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        finally:
            # Release the read end too instead of leaving it to the GC.
            proc.stdout.close()

    def _ensure_proc(self) -> None:
        """Start a worker subprocess unless a live one already exists."""
        if self._worker_alive():
            return

        # Reap a worker that died on its own before replacing it.
        self._shutdown_proc(timeout=2)

        log.debug('Starting STT worker subprocess')
        self._proc = subprocess.Popen(
            [sys.executable, str(Path(__file__).parent / 'stt_worker.py')],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        self._loaded = False

    def _send(self, cmd: dict) -> dict:
        """Send *cmd* to the worker and return its response dict.

        Starts the worker if necessary. Raises ``RuntimeError`` when the
        worker closes its output before answering, or when the response's
        ``ok`` entry is falsy (the message is the response's ``error``).
        """
        self._ensure_proc()
        # NOTE(review): pickle is acceptable only because both ends of the
        # pipe are processes started by this class; never feed it external
        # data.
        pickle.dump(cmd, self._proc.stdin)
        self._proc.stdin.flush()
        try:
            response = pickle.load(self._proc.stdout)
        except EOFError as e:
            # Reap the dead worker and close its pipes rather than just
            # dropping the reference (which left a zombie behind).
            self._shutdown_proc(timeout=2)
            raise RuntimeError('Worker subprocess exited unexpectedly') from e

        if not response.get('ok'):
            raise RuntimeError(response.get('error', 'unknown worker error'))
        return response

    def _reschedule_unload(self) -> None:
        """Restart the idle-unload countdown."""
        if self._unload_source is not None:
            GLib.source_remove(self._unload_source)
        self._unload_source = GLib.timeout_add_seconds(
            _IDLE_UNLOAD_SECONDS, self._on_idle_unload)

    def _on_idle_unload(self) -> bool:
        """Timeout callback: unload the model after the idle period."""
        # Clear the id first so unload_now() does not try to remove the
        # source that is currently being dispatched.
        self._unload_source = None
        log.debug('Idle unload after %ds', _IDLE_UNLOAD_SECONDS)
        self.unload_now()
        return GLib.SOURCE_REMOVE

View File

@@ -0,0 +1,54 @@
# This file is part of Gajim.
#
# Gajim is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Gajim is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.
import pickle
import sys
import traceback
def _respond(response: dict) -> None:
    """Pickle *response* onto the binary stdout stream and flush it."""
    out = sys.stdout.buffer
    pickle.dump(response, out)
    out.flush()
def main() -> None:
    """Serve pickled command dicts from stdin until it reaches EOF.

    Supported ``op`` values:

    * ``'load'``: load the model named by ``model_id`` (optionally from
      ``model_path``) and answer ``{'ok': True}``.
    * ``'recognize'``: run the loaded model on ``audio`` and answer
      ``{'ok': True, 'text': ...}``.

    Every failure is answered with ``{'ok': False, 'error': ...}`` so the
    loop keeps serving subsequent commands.
    """
    model = None
    while True:
        try:
            # NOTE(review): pickle must only be fed by the trusted process
            # that spawned this worker; never expose this loop to external
            # input.
            cmd = pickle.load(sys.stdin.buffer)
        except EOFError:
            # stdin was closed: nothing more to serve, exit the loop.
            return

        try:
            op = cmd['op']
            if op == 'load':
                # Imported on first use rather than at module import.
                import onnx_asr
                model = onnx_asr.load_model(
                    cmd['model_id'], cmd.get('model_path') or None)
                _respond({'ok': True})
            elif op == 'recognize':
                if model is None:
                    # Explicit protocol error instead of an opaque
                    # AttributeError on None.
                    _respond({
                        'ok': False,
                        'error': "recognize requested before a "
                                 "successful 'load'",
                    })
                else:
                    text = model.recognize(cmd['audio'])
                    _respond({'ok': True, 'text': text})
            else:
                _respond({'ok': False, 'error': f'unknown op: {op}'})
        except Exception as e:
            # Top-level boundary of the worker: report the failure to the
            # sender rather than crashing the process.
            _respond({
                'ok': False,
                'error': f'{type(e).__name__}: {e}',
                'traceback': traceback.format_exc(),
            })
# Entry point: run the command loop only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
main()

View File

@@ -13,7 +13,7 @@
"win32"
],
"requirements": [
"gajim>=1.9.0"
"gajim>=2.0.0"
],
"short_name": "stt_voice_messages",
"version": "0.0.1"

View File

@@ -15,51 +15,89 @@
from __future__ import annotations
import logging
from functools import partial
from pathlib import Path
from gi.repository import GLib, Gtk
from gajim.common import app
from gajim.plugins import GajimPlugin
from gajim.plugins.plugins_i18n import _
from .gtk.config_dialog import *
from .gtk.config_dialog import Configuration, STTVoiceMessagesConfigDialog
from .gtk.sttbox import STTBox
from .models.model_settings import OnnxAsrSettings
log = logging.getLogger('gajim.p.stt_voice_messages')
# Seconds the main window may stay inactive before the focus-loss timer
# fires and the model is unloaded.
_FOCUS_LOSS_UNLOAD_SECONDS = 30
class STTVoiceMessagesPlugin(GajimPlugin):
def init(self) -> None:
self.description = _('Transcribes voice messages to text.')
self.config_default_values = {
'auto_transcribe': (False, ''),
'model_id': ('nemo-parakeet-tdt-0.6b-v3', ''),
'model_path': ('', ''),
}
self._config = Configuration(self)
self._config.check_available_moduls()
self.config_dialog = partial(STTVoiceMessagesConfigDialog, self._config)
self.gui_extension_points = {
'preview_audio': (self._on_preview_audio_created, None),
}
self.config_default_values = {
'auto_transcribe': (False, ''),
'model': ('', ''),
'model_class': (None, ''),
'whisperai_model_size': ('tiny', ''),
'whisperai_translate': (False, ''),
}
self._active_handler_id = 0
self._focus_unload_source = None
self._audio_file: str = ''
self._preview_audio_widget = None
self._stt_box = None
def activate(self) -> None:
    """Start watching the main window's ``is-active`` property (once)."""
    if app.window is None or self._active_handler_id != 0:
        return
    self._active_handler_id = app.window.connect(
        'notify::is-active', self._on_window_active_changed)
def deactivate(self) -> None:
# Tear-down counterpart of activate().
# 1) Cancel a pending focus-loss unload timer, if one is scheduled.
if self._focus_unload_source is not None:
GLib.source_remove(self._focus_unload_source)
self._focus_unload_source = None
# 2) Stop listening for 'notify::is-active' on the main window.
if self._active_handler_id != 0 and app.window is not None:
app.window.disconnect(self._active_handler_id)
self._active_handler_id = 0
# 3) Unload the model when self._config.is_available is true.
if self._config.is_available:
self._config.unload_model()
def _on_window_active_changed(self,
                              window: Gtk.Window,
                              _pspec: object,
                              ) -> None:
    """Arm or cancel the focus-loss unload timer when focus changes.

    When the window becomes inactive a timer of
    ``_FOCUS_LOSS_UNLOAD_SECONDS`` is started unless one is already
    pending; when it becomes active again a pending timer is cancelled.
    """
    if not window.is_active():
        if self._focus_unload_source is None:
            self._focus_unload_source = GLib.timeout_add_seconds(
                _FOCUS_LOSS_UNLOAD_SECONDS, self._on_focus_unload_fired)
        return

    pending = self._focus_unload_source
    if pending is not None:
        GLib.source_remove(pending)
        self._focus_unload_source = None
def _on_focus_unload_fired(self) -> bool:
    """Timeout callback: unload the model after the window lost focus.

    Returns ``GLib.SOURCE_REMOVE`` so the timeout does not repeat.
    """
    # The source is being dispatched right now, so just forget its id.
    self._focus_unload_source = None
    config = self._config
    if config.is_available:
        config.unload_model()
    return GLib.SOURCE_REMOVE
def _on_preview_audio_created(self,
                              preview_audio_widget: Gtk.Box,
                              drawing_box: Gtk.Box,
                              control_box: Gtk.Box,
                              audio_file: Path
                              ) -> None:
    """Remember the preview's widgets and audio path, then build the STT box.

    Registered for the ``preview_audio`` GUI extension point.
    """
    self._preview_audio_widget = preview_audio_widget
    self._drawing_box = drawing_box
    self._control_box = control_box
    posix_path = audio_file.as_posix()
    self._audio_file = posix_path
    self._create_stt_box()
def _create_stt_box(self) -> None:
assert self._preview_audio_widget is not None
self._stt_box = STTBox(self._preview_audio_widget,
self.config,
self._audio_file)
self._preview_audio_widget.pack_end(self._stt_box, False, False, 0)
self._stt_box = STTBox(self._config, self._audio_file)
self._control_box.append(self._stt_box.button)
self._drawing_box.append(self._stt_box)