From 194300a4d805bdcab6d467b352f0f90a3cc247a1 Mon Sep 17 00:00:00 2001 From: mesonium Date: Tue, 11 Jun 2024 18:30:45 +0200 Subject: [PATCH] Add STT Voice Messages Plugin --- stt_voice_messages/README.md | 28 +++++++ stt_voice_messages/__init__.py | 1 + stt_voice_messages/config_dialog.py | 63 ++++++++++++++ stt_voice_messages/plugin-manifest.json | 20 +++++ stt_voice_messages/stt_voice_message.png | Bin 0 -> 371 bytes stt_voice_messages/stt_voice_messages.py | 101 +++++++++++++++++++++++ 6 files changed, 213 insertions(+) create mode 100644 stt_voice_messages/README.md create mode 100644 stt_voice_messages/__init__.py create mode 100644 stt_voice_messages/config_dialog.py create mode 100644 stt_voice_messages/plugin-manifest.json create mode 100644 stt_voice_messages/stt_voice_message.png create mode 100644 stt_voice_messages/stt_voice_messages.py diff --git a/stt_voice_messages/README.md b/stt_voice_messages/README.md new file mode 100644 index 0000000..dcb205e --- /dev/null +++ b/stt_voice_messages/README.md @@ -0,0 +1,28 @@ +# Requirements + +## STT Models + +### openai-whisper https://github.com/openai/whisper + +#### Installation +`pip install -U openai-whisper` will install the following packages: + +``` +mpmath, urllib3, tqdm, sympy, regex, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, +nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, +nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, networkx, +MarkupSafe, llvmlite, fsspec, filelock, charset-normalizer, certifi, triton, +requests, nvidia-cusparse-cu12, nvidia-cudnn-cu12, numba, jinja2, tiktoken, +nvidia-cusolver-cu12, torch, openai-whisper +``` + +#### Models + +| Multilingual Model  | Download Size | VRAM Requirement | Relative Speed | +|---------------------|---------------| ---------------- |----------------| +| Tiny | 70 MB | ~1 GB | ~32x | +| Base | 140 MB | ~1 GB | ~16x | +| Small | 460 MB | ~2 GB | ~6x | +| Medium | 1.4 GB | ~5 GB | ~2x | +| Large | 2.9 GB | ~10 GB | ~1x | + 
diff --git a/stt_voice_messages/__init__.py b/stt_voice_messages/__init__.py new file mode 100644 index 0000000..ce9a892 --- /dev/null +++ b/stt_voice_messages/__init__.py @@ -0,0 +1 @@ +from .stt_voice_messages import STTVoiceMessagesPlugin # type: ignore # noqa: F401 diff --git a/stt_voice_messages/config_dialog.py b/stt_voice_messages/config_dialog.py new file mode 100644 index 0000000..14eef6b --- /dev/null +++ b/stt_voice_messages/config_dialog.py @@ -0,0 +1,63 @@ +# This file is part of Gajim. +# +# Gajim is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Gajim is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Gajim. If not, see <http://www.gnu.org/licenses/>. 
from __future__ import annotations

from typing import Any
from typing import TYPE_CHECKING

from gi.repository import Gtk

from gajim.plugins.plugins_i18n import _

from gajim.gtk.settings import SettingsDialog
from gajim.gtk.const import Setting
from gajim.gtk.const import SettingKind
from gajim.gtk.const import SettingType

if TYPE_CHECKING:
    from .stt_voice_messages import STTVoiceMessagesPlugin


class STTVoiceMessagesConfigDialog(SettingsDialog):
    """Settings dialog for the STT Voice Messages plugin.

    Offers two settings, both written straight into ``plugin.config``:
    the model size (``model_size``) and whether the multilanguage model
    is used (``use_multilanguage_model``).
    """

    def __init__(self,
                 plugin: STTVoiceMessagesPlugin,
                 parent: Gtk.Window
                 ) -> None:
        """Build the dialog.

        :param plugin: plugin instance whose ``config`` is read and written
        :param parent: window the dialog is attached to
        """
        model_sizes = ('tiny', 'base', 'small', 'medium', 'large')

        self.plugin = plugin
        settings = [
            # `value` preselects the stored size; `data` is the config key
            # that `_on_setting` writes to. Without `data` the callback
            # would store the selection under the key `None`.
            Setting(SettingKind.COMBO,
                    _('Language Model'),
                    SettingType.VALUE,
                    self.plugin.config['model_size'],
                    callback=self._on_setting,
                    data='model_size',
                    props={'combo_items': model_sizes}),

            Setting(SettingKind.SWITCH,
                    _('Use Multilanguage Model'),
                    SettingType.VALUE,
                    self.plugin.config['use_multilanguage_model'],
                    callback=self._on_setting,
                    data='use_multilanguage_model'),
        ]

        SettingsDialog.__init__(self,
                                parent,
                                _('STT Voice Message Configuration'),
                                Gtk.DialogFlags.MODAL,
                                settings,
                                '')

    def _on_setting(self, value: Any, data: Any) -> None:
        """Store a changed setting; ``data`` is the plugin config key."""
        self.plugin.config[data] = value
0000000000000000000000000000000000000000..f7cea2fa10d4b7b7b8324757f9483aaf0212d3b3 GIT binary patch literal 371 zcmV-(0gV2MP)ftCGW@rMlzaWr`*?P1kkk`<_0} z^Lz}z0hVDHu&(Q#K$7da20;+c0n;==*LBb|4c(KH$8n^dK*{%0Kvh+c=Q(`N2SNx~ zmc0fbrx!)Bzat<5;5hCYplw@v+NWh%2C^)l1MnsJX`24)f+&ihC<;yfc8TcdUvM0U zwpEfO;CUV}99-8$mj?+4+qeUMS(d;s%-2CW9c9n+Y$7d55-RbQge&$EU;q`ytQB#? Ri8lZM002ovPDHLkV1lR+lr8`O literal 0 HcmV?d00001 diff --git a/stt_voice_messages/stt_voice_messages.py b/stt_voice_messages/stt_voice_messages.py new file mode 100644 index 0000000..bbc8c68 --- /dev/null +++ b/stt_voice_messages/stt_voice_messages.py @@ -0,0 +1,101 @@ +# This file is part of Gajim. +# +# Gajim is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Gajim is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Gajim. If not, see . 
from __future__ import annotations

from typing import TYPE_CHECKING

import logging
import threading
from functools import partial
from pathlib import Path

import whisper
from gi.repository import GLib
from gi.repository import Gtk
from stt_voice_messages.config_dialog import STTVoiceMessagesConfigDialog

from gajim.plugins import GajimPlugin
from gajim.plugins.plugins_i18n import _

if TYPE_CHECKING:
    from gajim.plugins.gajimplugin import GajimPluginConfig

log = logging.getLogger('gajim.p.stt_voice_messages')


class STTVoiceMessagesPlugin(GajimPlugin):
    """Plugin that attaches a speech-to-text box to audio previews."""

    def init(self) -> None:
        """Set description, config dialog, extension point and defaults."""
        self.description = _('Transcribes voice messages to text.')
        self.config_dialog = partial(STTVoiceMessagesConfigDialog, self)

        self.gui_extension_points = {
            'preview_audio': (self._preview_audio_created, None),
        }

        self.config_default_values = {
            'use_multilanguage_model': (True, ''),
            'model_size': ('tiny', ''),
        }

        # State of the most recently created audio preview
        self._audio_file: str | None = None
        self._preview_audio_widget: Gtk.Box | None = None
        self._stt_box: STTBox | None = None

    def _preview_audio_created(self,
                               preview_audio_widget: Gtk.Box,
                               audio_file: Path
                               ) -> None:
        """Handler for the 'preview_audio' GUI extension point.

        :param preview_audio_widget: box the STT widgets are packed into
        :param audio_file: path of the audio file shown in the preview
        """
        self._preview_audio_widget = preview_audio_widget
        # Kept as a string because this value is later handed to
        # whisper's transcribe().
        self._audio_file = audio_file.as_posix()
        self._create_stt_box()

    def _create_stt_box(self) -> None:
        """Create an STTBox and pack it at the end of the preview widget."""
        assert self._preview_audio_widget is not None
        assert self._audio_file is not None
        self._stt_box = STTBox(self._preview_audio_widget,
                               self.config,
                               self._audio_file)
        self._preview_audio_widget.pack_end(self._stt_box, False, False, 0)


class STTBox(Gtk.Box):
    """Vertical box with a 'Transcribe' button and a result label."""

    def __init__(self,
                 preview_audio_widget: Gtk.Box,
                 config: GajimPluginConfig,
                 audio_file: str,
                 ) -> None:
        """Build the widgets.

        :param preview_audio_widget: the audio preview this box belongs to
        :param config: plugin config; 'model_size' and
            'use_multilanguage_model' are read when transcribing
        :param audio_file: path string of the audio file to transcribe
        """
        Gtk.Box.__init__(self,
                         orientation=Gtk.Orientation.VERTICAL,
                         spacing=12)

        self._config = config
        self._preview_audio = preview_audio_widget
        self._audio_file = audio_file

        self._transcribe_button = Gtk.Button(label=_('Transcribe'))
        # Starts empty; filled once a transcription has finished
        self._transcription_label = Gtk.Label()
        self.add(self._transcribe_button)
        self.add(self._transcription_label)

        self._transcribe_button.connect('clicked', self._on_transcribe_clicked)

        self.show_all()

    def _on_transcribe_clicked(self, _button: Gtk.Button) -> None:
        """Start a transcription without blocking the GTK main loop."""
        # Prevent a second run from being started while one is in progress
        self._transcribe_button.set_sensitive(False)
        self._transcription_label.set_text(_('Transcribing…'))

        # The config is read here, on the main thread; only the resolved
        # model name is handed to the worker.
        worker = threading.Thread(target=self._transcribe_in_background,
                                  args=(self._model_name(),),
                                  daemon=True)
        worker.start()

    def _model_name(self) -> str:
        """Return the whisper model name for the current configuration.

        English-only whisper models carry an '.en' suffix; 'large' is only
        available as a multilingual model.
        """
        size = self._config['model_size']
        if self._config['use_multilanguage_model'] or size == 'large':
            return size
        return f'{size}.en'

    def _transcribe_in_background(self, model_name: str) -> None:
        """Worker thread body: transcribe, then hand the text to the UI."""
        try:
            text = self._transcribe_by_whisper(model_name)
        except Exception:
            # Thread boundary: log the failure and show it in the label
            # instead of letting the worker die silently.
            log.exception('Transcription of %s failed', self._audio_file)
            text = _('Transcription failed')
        # GTK widgets must only be touched from the main loop
        GLib.idle_add(self._show_transcription, text)

    def _show_transcription(self, text: str) -> bool:
        """Idle callback: display *text* and re-enable the button."""
        self._transcription_label.set_text(text)
        self._transcribe_button.set_sensitive(True)
        return GLib.SOURCE_REMOVE

    def _transcribe_by_whisper(self, model_name: str) -> str:
        """Load whisper model *model_name* and transcribe the audio file.

        :returns: the value of the 'text' key of whisper's result
        """
        model = whisper.load_model(model_name)
        result = model.transcribe(self._audio_file)
        return result['text']