Make Faster Whisper and OpenAI Whisper work

2024-07-03 18:43:27 +02:00
parent b0adecef7a
commit 2d7630a757
7 changed files with 261 additions and 51 deletions
@@ -15,13 +15,23 @@
 from __future__ import annotations
 from dataclasses import asdict
 import logging
 import typing
 from pathlib import Path
 from typing import TYPE_CHECKING
-import whisper
+try:
    import whisper
 except ModuleNotFoundError:
    if typing.TYPE_CHECKING:
        import whisper
 try:
    import faster_whisper as fwhisper
 except ModuleNotFoundError:
    if typing.TYPE_CHECKING:
        import faster_whisper as fwhisper
 from gi.repository import Gtk
 from gajim.common import app
@@ -33,7 +43,7 @@ from gajim.gtk.sidebar_switcher import SideBarSwitcher
 from gajim.plugins.helpers import get_builder
 from gajim.plugins.plugins_i18n import _
-from ..models import openai_whisper
+from ..models import faster_whisper, openai_whisper
 from ..models.model_settings import *
 if TYPE_CHECKING:
@@ -56,18 +66,10 @@ SUPPORTED_MODELS: dict[str, Model] = {
                                 ['whisper'],
                                 openai_whisper.WhisperModel,
                                 OpenAIWhisperSettings),
-    'model_ctranslate2': Model('CTranslate2',
+    'model_faster-whisper': Model('Faster-Whisper',
-                               ['ctranslate2'],
+                                  ['faster_whisper'],
-                               None,
+                                  faster_whisper.FasterWhisperModel,
-                               None),
+                                  FasterWhisperSettings)
    'model_faster-whisper': Model('Fast-Whisper',
                                  ['faster-whisper'],
                                  None,
                                  None),
    'model_distill': Model('Distill',
                           ['transformers', 'accelerate', 'datasets[audio]'],
                           None,
                           None)
 }
@@ -78,7 +80,7 @@ class Configuration:
        self._available_models: dict[str, Model] = {}
        self.check_available_moduls()
-        log.debug('config = %s', self._plugin.config['model_openaiwhisper'])
+        log.debug('config = %s', self._plugin.config)
    @property
    def plugin(self) -> STTVoiceMessagesPlugin:
@@ -106,20 +108,20 @@ class Configuration:
        self._plugin.config.data[model].instance.set_config(self.plugin.config.data[model])
-    def on_set_model(self, model: Any) -> None:
+    def create_model(self, model: Any) -> None:
        if isinstance(model, str):
            model.strip()
        log.debug('plugin config before:\n %s', self.plugin.config.data)
        if (self.plugin.config.data[model].instance is None and
                self._available_models[model].klass is not None):
            self.plugin.config.data[model].instance = \
                self._available_models[model].klass()
        else:
-            return
+            log.debug('Could not create model %s', model)
    def on_set_model(self, model: Any, data: str = 'model') -> None:
        if isinstance(model, str):
            model.strip()
        self.plugin.config['model'] = model
-        log.debug('plugin config after:\n %s', self.plugin.config.data)
+        log.debug('Created model %s with config %s', model, self.plugin.config.data[model])
    def check_available_moduls(self):
        def is_module_available(module: str) -> bool:
@@ -146,6 +148,7 @@ class Configuration:
                    log.debug('plugin config for model = %s', self.plugin.config[model])
                    self.plugin.config.data[model].instance = None
                    self._available_models[model].config = self.plugin.config[model]
                    self.create_model(model)
        self.on_set_model(self._plugin.config['model'])
@@ -192,12 +195,39 @@ class STTVoiceMessagesConfigDialog(Gtk.ApplicationWindow):
        prefs: list[tuple[str, type[PreferenceBox]]] = [
            ('stt_behaviour', self.STTBehaviour),
            ('models', self.Models),
            ('whisper_general', self.OpenAIWhisperGeneral),
        ]
        self._add_prefs(prefs)
        # TODO: Refactor this
        if 'model_openaiwhisper' in config.available_models:
            prefs.append(('openaiwhisper_general', self.OpenAIWhisperGeneral))
        else:
            self._disable_pref('openai-whisper-viewport') # does not work yet
        if 'model_faster-whisper' in config.available_models:
            prefs.append(('fasterwhisper_general', self.FasterWhisperGeneral))
        else:
            self._disable_pref('faster-whisper') # does not work yet
        self._add_prefs(prefs)
        self.show_all()
    def _add_prefs(self, prefs: list[tuple[str, type[PreferenceBox]]]):
        """Build each preference box and attach it to its UI container.

        For every ``(ui_name, klass)`` pair the container is looked up as an
        attribute of ``self._ui``, ``klass`` is instantiated with this dialog
        as its only argument, the new widget is added to the container and
        remembered in ``self._prefs`` under ``ui_name``.
        """
        for entry in prefs:
            ui_name, klass = entry
            container = getattr(self._ui, ui_name)
            widget = klass(self)  # pyright: ignore
            log.debug('ui_name = %s, klass = %s, pref_box = %s', ui_name, klass, container)
            container.add(widget)
            self._prefs[ui_name] = widget
    def _disable_pref(self, pref: str):
        """Try to make a settings page unreachable by scrolling.

        Looks up the widget named ``pref`` on ``self._ui`` and installs one
        shared ``Gtk.Adjustment(0, 0, 0)`` as both its horizontal and its
        vertical focus adjustment.
        """
        # TODO: Not scrolling to setting does not work!
        target = getattr(self._ui, pref)
        log.debug('Disable Settings Page for %s', target)
        # The same adjustment object is deliberately used for both axes.
        zero_adjustment = Gtk.Adjustment(0, 0, 0)
        target.set_focus_hadjustment(zero_adjustment)
        target.set_focus_vadjustment(zero_adjustment)
    ############################################################################
    # General Settings
    ############################################################################
@@ -266,9 +296,36 @@ class STTVoiceMessagesConfigDialog(Gtk.ApplicationWindow):
        def _set_config(self, value: Any, data: Any):
            self._config_dialog.config.on_config_model(self._model, value, data)
-    def _add_prefs(self, prefs: list[tuple[str, type[PreferenceBox]]]):
+    ############################################################################
-        for ui_name, klass in prefs:
+    # Faster Whisper Settings
-            pref_box = getattr(self._ui, ui_name)
+    ############################################################################
-            pref = klass(self)  # pyright: ignore
+    class FasterWhisperGeneral(PreferenceBox):
-            pref_box.add(pref)
+        def __init__(self,
-            self._prefs[ui_name] = pref
+                     config_dialog: STTVoiceMessagesConfigDialog) -> None:
            self._model = 'model_faster-whisper'
            self._config_dialog = config_dialog
            settings = [
                Setting(SettingKind.POPOVER,
                        _('Language Model Size'),
                        SettingType.VALUE,
                        value=config_dialog.config.available_models[
                            self._model].config.model_size,
                        data='model_size',
                        callback=self._set_config,
                        props={'entries': fwhisper.available_models()}),
                Setting(SettingKind.SWITCH,
                        _('Translate'),
                        SettingType.VALUE,
                        value=config_dialog.config.available_models[
                            self._model].config.translate_to_english,
                        data='translate_to_english',
                        callback=self._set_config)
            ]
            PreferenceBox.__init__(self, settings)
        def _set_config(self, value: Any, data: Any):
            self._config_dialog.config.on_config_model(self._model, value,
                                                       data)
@@ -127,14 +127,15 @@
          </packing>
        </child>
        <child>
-          <object class="GtkScrolledWindow">
+          <object class="GtkScrolledWindow" id="openai-whisper">
            <property name="name">openai-whisper</property>
            <property name="visible">True</property>
            <property name="can-focus">True</property>
            <property name="hscrollbar-policy">never</property>
            <property name="shadow-type">in</property>
            <property name="overlay-scrolling">False</property>
            <child>
-              <object class="GtkViewport">
+              <object class="GtkViewport" id="openai-whisper-viewport">
                <property name="visible">True</property>
                <property name="can-focus">False</property>
                <child>
@@ -145,7 +146,7 @@
                    <property name="spacing">24</property>
                    <child>
                      <!-- n-columns=3 n-rows=3 -->
-                      <object class="GtkGrid" id="whisper_general">
+                      <object class="GtkGrid" id="openaiwhisper_general">
                        <property name="visible">True</property>
                        <property name="can-focus">False</property>
                        <property name="orientation">vertical</property>
@@ -219,6 +220,54 @@
            <property name="position">1</property>
          </packing>
        </child>
        <child>
          <object class="GtkScrolledWindow" id="faster-whisper">
            <property name="visible">True</property>
            <property name="can-focus">True</property>
            <property name="hscrollbar-policy">never</property>
            <property name="shadow-type">in</property>
            <property name="overlay-scrolling">False</property>
            <child>
              <object class="GtkViewport">
                <property name="visible">True</property>
                <property name="can-focus">False</property>
                <child>
                  <object class="GtkBox">
                    <property name="visible">True</property>
                    <property name="can-focus">False</property>
                    <property name="orientation">vertical</property>
                    <property name="spacing">24</property>
                    <child>
                      <!-- n-columns=3 n-rows=3 -->
                      <object class="GtkGrid" id="fasterwhisper_general">
                        <property name="visible">True</property>
                        <property name="can-focus">False</property>
                        <property name="orientation">vertical</property>
                        <property name="row-spacing">12</property>
                        <child>
                          <object class="GtkLabel">
                            <property name="visible">True</property>
                            <property name="can-focus">False</property>
                            <property name="label" translatable="yes">General</property>
                            <property name="xalign">0</property>
                            <style>
                              <class name="bold"/>
                            </style>
                          </object>
                          <packing>
                            <property name="left-attach">0</property>
                            <property name="top-attach">0</property>
                          </packing>
                        </child>
                        <child>
                          <placeholder/>
                        </child>
                        <child>
                          <placeholder/>
                        </child>
                        <child>
                          <placeholder/>
                        </child>
                        <child>
                          <placeholder/>
                        </child>
@@ -234,6 +283,36 @@
                        <child>
                          <placeholder/>
                        </child>
                      </object>
                      <packing>
                        <property name="expand">False</property>
                        <property name="fill">True</property>
                        <property name="position">0</property>
                      </packing>
                    </child>
                    <child>
                      <placeholder/>
                    </child>
                    <child>
                      <placeholder/>
                    </child>
                    <child>
                      <placeholder/>
                    </child>
                    <child>
                      <placeholder/>
                    </child>
                  </object>
                </child>
              </object>
            </child>
          </object>
          <packing>
            <property name="name">faster-whisper</property>
            <property name="title" translatable="yes">Faster Whisper</property>
            <property name="position">2</property>
          </packing>
        </child>
        <style>
          <class name="settings-stack"/>
        </style>
@@ -0,0 +1,60 @@
 # This file is part of Gajim.
 #
 # Gajim is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # Gajim is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with Gajim. If not, see <http://www.gnu.org/licenses/>.
 import logging
 import typing
 from dataclasses import dataclass
 from pathlib import Path
 from ..helper import Results
 from .model_settings import FasterWhisperSettings
 from .model_template import Model
 log = logging.getLogger('gajim.p.sttvm_faster_whisper')
 try:
    import faster_whisper
 except ModuleNotFoundError:
    if typing.TYPE_CHECKING:
        import faster_whisper
@dataclass
 class Configuration:
    model_size: str
 class FasterWhisperModel(Model):
    def __init__(self):
        """Start with default settings and an empty result string."""
        # Default settings stay in effect until set_config() replaces them.
        self._config = FasterWhisperSettings()
        self._result: str = ''
    @property
    def result(self) -> str:
        """Return the text stored in ``self._result``.

        NOTE(review): in the visible code ``_result`` is only ever set to
        ``''`` by ``__init__``; ``transcribe`` writes its text to the
        ``Results`` object passed in instead. Confirm whether this property
        is still needed.
        """
        return self._result
    def transcribe(self, result: Results, audio_file: Path) -> None:
        """Transcribe ``audio_file`` and store the text in ``result.text``.

        A new ``faster_whisper.WhisperModel`` is loaded on every call, using
        the currently configured ``model_size`` and ``float32`` compute type.
        When ``translate_to_english`` is enabled in the settings, the
        ``translate`` task is requested instead of plain transcription.

        :param result: Object whose ``text`` attribute receives the
            concatenated text of all segments.
        :param audio_file: Path of the audio file to process.
        """
        model_size = self._config.model_size
        model = faster_whisper.WhisperModel(model_size,
                                            compute_type="float32")
        log.debug('model size is used = %s', model_size)

        # Honour the "Translate" switch from the settings dialog; before,
        # the setting was stored but never passed on to faster-whisper.
        if self._config.translate_to_english:
            task = 'translate'
        else:
            task = 'transcribe'

        # faster-whisper documents str, binary file object or ndarray as
        # audio input, so the Path is passed as a plain string.
        segments, _info = model.transcribe(str(audio_file), task=task)

        # The segments are produced lazily; list() runs the whole
        # transcription here so they can be logged and joined afterwards.
        segments = list(segments)
        log.debug('segments = %s', segments)
        for segment in segments:
            # Per-segment timing goes to the plugin log, not to stdout.
            log.debug('[%.2fs -> %.2fs] %s',
                      segment.start, segment.end, segment.text)

        result.text = ''.join(segment.text for segment in segments)
    def set_config(self, config: FasterWhisperSettings) -> None:
        """Replace the settings used by later ``transcribe`` calls.

        :param config: Settings object; it is stored as-is (no copy is
            made) and ``transcribe`` reads ``model_size`` from it.
        """
        self._config = config
@@ -21,3 +21,8 @@ from dataclasses import dataclass, field
 class OpenAIWhisperSettings:
    model_size: str = field(default='tiny', init=True)
    translate_to_english: bool = field(default=False, init=True)
@dataclass
 class FasterWhisperSettings:
    model_size: str = field(default='tiny', init=True)
    translate_to_english: bool = field(default=False, init=True)
@@ -15,6 +15,7 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Any
 from ..helper import Results
@@ -22,5 +23,9 @@ from ..helper import Results
 class Model(ABC):
    @abstractmethod
-    def transcribe(self, result: Results, audio_file: Path) -> str:
+    def transcribe(self, result: Results, audio_file: Path) -> None:
-        return ''
+        pass
    @abstractmethod
    def set_config(self, config: Any) -> None:
        pass
@@ -37,7 +37,6 @@ class Configuration:
 class WhisperModel(Model):
    def __init__(self):
        # TODO
        self._result: str = ''
        self._config = OpenAIWhisperSettings()
@@ -45,7 +44,7 @@ class WhisperModel(Model):
    def result(self) -> str:
        return self._result
-    def transcribe(self, result: Results, audio_file: Path) -> str:
+    def transcribe(self, result: Results, audio_file: Path) -> None:
        model = whisper.load_model(self._config.model_size)
        log.debug('model size is used = %s', self._config.model_size)
        result.text = model.transcribe(audio_file)['text']  # pyright: ignore [reportAttributeAccessIssue]
@@ -38,6 +38,11 @@ class STTVoiceMessagesPlugin(GajimPlugin):
                OpenAIWhisperSettings(
                    model_size='tiny',
                    translate_to_english=False),
                ''),
            'model_faster-whisper': (
                FasterWhisperSettings(
                    model_size='tiny',
                    translate_to_english=False),
                '')
        }