Browse Source

enhance: use the new iOS Speech API (SpeechAnalyzer/SpeechTranscriber) for transcription

Tienson Qin 3 months ago
parent
commit
6b55a90ea1

+ 1 - 0
android/app/capacitor.build.gradle

@@ -14,6 +14,7 @@ dependencies {
     implementation project(':capacitor-app')
     implementation project(':capacitor-camera')
     implementation project(':capacitor-clipboard')
+    implementation project(':capacitor-device')
     implementation project(':capacitor-filesystem')
     implementation project(':capacitor-haptics')
     implementation project(':capacitor-keyboard')

+ 4 - 0
android/app/src/main/assets/capacitor.plugins.json

@@ -19,6 +19,10 @@
 		"pkg": "@capacitor/clipboard",
 		"classpath": "com.capacitorjs.plugins.clipboard.ClipboardPlugin"
 	},
+	{
+		"pkg": "@capacitor/device",
+		"classpath": "com.capacitorjs.plugins.device.DevicePlugin"
+	},
 	{
 		"pkg": "@capacitor/filesystem",
 		"classpath": "com.capacitorjs.plugins.filesystem.FilesystemPlugin"

+ 3 - 0
android/capacitor.settings.gradle

@@ -17,6 +17,9 @@ project(':capacitor-camera').projectDir = new File('../node_modules/@capacitor/c
 include ':capacitor-clipboard'
 project(':capacitor-clipboard').projectDir = new File('../node_modules/@capacitor/clipboard/android')
 
+include ':capacitor-device'
+project(':capacitor-device').projectDir = new File('../node_modules/@capacitor/device/android')
+
 include ':capacitor-filesystem'
 project(':capacitor-filesystem').projectDir = new File('../node_modules/@capacitor/filesystem/android')
 

+ 59 - 41
ios/App/App/UILocalPlugin.swift

@@ -209,51 +209,71 @@ public class UILocalPlugin: CAPPlugin, CAPBridgedPlugin {
     CAPPluginMethod(name: "transcribeAudio2Text", returnType: CAPPluginReturnPromise)
   ]
 
-  // TODO: switch to use https://developer.apple.com/documentation/speech/speechanalyzer for iOS 26+
-  // 语音识别方法
-  private func recognizeSpeech(from url: URL, completion: @escaping (String?, Error?) -> Void) {
-      SFSpeechRecognizer.requestAuthorization { authStatus in
-          guard authStatus == .authorized else {
-              completion(nil, NSError(domain: "", code: -1, userInfo: [NSLocalizedDescriptionKey: "语音识别权限未授权"]))
-              return
-          }
-
-          let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))
-          let request = SFSpeechURLRecognitionRequest(url: url)
-
-          // Setting up offline speech recognition
-          recognizer?.supportsOnDeviceRecognition = true
-          request.shouldReportPartialResults = false
-          request.requiresOnDeviceRecognition = true
-          request.taskHint = .dictation
-          if #available(iOS 16, *) {
-              request.addsPunctuation = true
-          }
+  // Transcribes the audio file at `file` into text for the given locale.
+  // On iOS 26+ uses the modern SpeechAnalyzer/SpeechTranscriber pipeline;
+  // on older systems falls back to SFSpeechRecognizer so that `completion`
+  // is ALWAYS invoked exactly once (previously the availability check had
+  // no else branch, leaving callers on iOS < 26 hanging forever).
+  func recognizeSpeech(from file: URL, locale: String, completion: @escaping (String?, Error?) -> Void) {
+        if #available(iOS 26.0, *) {
+            // Modern API: SpeechTranscriber + SpeechAnalyzer
+            Task {
+                do {
+                    // Step 1: map the requested locale onto one the transcriber supports.
+                    guard let supportedLocale = await SpeechTranscriber.supportedLocale(equivalentTo: Locale(identifier: locale)) else {
+                        throw NSError(domain: "Speech", code: -1,
+                                      userInfo: [NSLocalizedDescriptionKey: "Unsupported locale"])
+                    }
+
+                    // Step 2: transcriber configured with the transcription preset.
+                    let transcriber = SpeechTranscriber(locale: supportedLocale, preset: .transcription)
+
+                    // Ensure on-device model assets are present (downloads if needed).
+                    if let installRequest = try await AssetInventory.assetInstallationRequest(supporting: [transcriber]) {
+                        try await installRequest.downloadAndInstall()
+                    }
+
+                    // Step 3: start collecting transcription results concurrently.
+                    async let transcriptionFuture: String = try transcriber.results.reduce(into: "") { partial, result in
+                        partial += String(result.text.characters) + " "
+                    }
+
+                    // Step 4: analyzer drives the transcriber module.
+                    let analyzer = SpeechAnalyzer(modules: [transcriber])
+
+                    // Step 5/6: run analysis over the whole audio file, then finalize.
+                    let audioFile = try AVAudioFile(forReading: file)
+                    if let lastSample = try await analyzer.analyzeSequence(from: audioFile) {
+                        try await analyzer.finalizeAndFinish(through: lastSample)
+                    } else {
+                        try await analyzer.cancelAndFinishNow()
+                    }
+
+                    // Step 7/8: await the accumulated transcription text.
+                    let finalText = try await transcriptionFuture.trimmingCharacters(in: .whitespacesAndNewlines)
+                    completion(finalText, nil)
+
+                } catch {
+                    completion(nil, error)
+                }
+            }
+        } else {
+            // Legacy fallback (pre-iOS 26): on-device SFSpeechRecognizer.
+            SFSpeechRecognizer.requestAuthorization { authStatus in
+                guard authStatus == .authorized else {
+                    completion(nil, NSError(domain: "Speech", code: -1,
+                                            userInfo: [NSLocalizedDescriptionKey: "Speech recognition not authorized"]))
+                    return
+                }
+
+                let recognizer = SFSpeechRecognizer(locale: Locale(identifier: locale))
+                let request = SFSpeechURLRecognitionRequest(url: file)
+
+                // Offline, final-results-only dictation.
+                recognizer?.supportsOnDeviceRecognition = true
+                request.shouldReportPartialResults = false
+                request.requiresOnDeviceRecognition = true
+                request.taskHint = .dictation
+                if #available(iOS 16, *) {
+                    request.addsPunctuation = true
+                }
+
+                recognizer?.recognitionTask(with: request) { result, error in
+                    if let result = result {
+                        completion(result.bestTranscription.formattedString, nil)
+                    } else if let error = error {
+                        completion(nil, error)
+                    }
+                }
+            }
+        }
+    }
 
   @objc func transcribeAudio2Text(_ call: CAPPluginCall) {
     self.call = call
 
-    // 接收音频数据 arrayBuffer
+    // audio arrayBuffer
     guard let audioArray = call.getArray("audioData", NSNumber.self) as? [UInt8] else {
-      call.reject("无效的音频数据")
+      call.reject("invalid audioData")
       return
     }
 
-    // 将数组转换为 Data
+    guard let locale = call.getString("locale") else {
+        call.reject("invalid locale")
+        return
+    }
+
     let audioData = Data(audioArray)
 
-    // 保存为本地文件
     let fileURL = FileManager.default.temporaryDirectory.appendingPathComponent("recordedAudio.m4a")
 
     do {
@@ -261,23 +281,21 @@ public class UILocalPlugin: CAPPlugin, CAPBridgedPlugin {
 
       let fileExists = FileManager.default.fileExists(atPath: fileURL.path)
 
-      print("文件是否存在: \(fileExists), 路径: \(fileURL.path)")
+      print("file exists: \(fileExists), path: \(fileURL.path)")
       if !fileExists {
-          call.reject("文件保存失败,文件不存在")
+          call.reject("file save failed: file doesn't exist")
           return
       }
 
-
-      // 调用语音识别
-      self.recognizeSpeech(from: fileURL) { result, error in
+      self.recognizeSpeech(from: fileURL, locale: locale) { result, error in
           if let result = result {
             call.resolve(["transcription": result])
           } else if let error = error {
-            call.reject("语音识别失败: \(error.localizedDescription)")
+            call.reject("failed to transcribe: \(error.localizedDescription)")
           }
         }
     } catch {
-      call.reject("保存文件失败: \(error.localizedDescription)")
+      call.reject("failed to transcribe: \(error.localizedDescription)")
     }
   }
 

+ 1 - 0
ios/App/Podfile

@@ -16,6 +16,7 @@ def capacitor_pods
   pod 'CapacitorApp', :path => '../../node_modules/@capacitor/app'
   pod 'CapacitorCamera', :path => '../../node_modules/@capacitor/camera'
   pod 'CapacitorClipboard', :path => '../../node_modules/@capacitor/clipboard'
+  pod 'CapacitorDevice', :path => '../../node_modules/@capacitor/device'
   pod 'CapacitorFilesystem', :path => '../../node_modules/@capacitor/filesystem'
   pod 'CapacitorHaptics', :path => '../../node_modules/@capacitor/haptics'
   pod 'CapacitorKeyboard', :path => '../../node_modules/@capacitor/keyboard'

+ 7 - 1
ios/App/Podfile.lock

@@ -12,6 +12,8 @@ PODS:
   - CapacitorCommunitySafeArea (7.0.0-alpha.1):
     - Capacitor
   - CapacitorCordova (7.2.0)
+  - CapacitorDevice (7.0.2):
+    - Capacitor
   - CapacitorFilesystem (7.0.1):
     - Capacitor
   - CapacitorHaptics (7.0.1):
@@ -39,6 +41,7 @@ DEPENDENCIES:
   - "CapacitorClipboard (from `../../node_modules/@capacitor/clipboard`)"
   - "CapacitorCommunitySafeArea (from `../../node_modules/@capacitor-community/safe-area`)"
   - "CapacitorCordova (from `../../node_modules/@capacitor/ios`)"
+  - "CapacitorDevice (from `../../node_modules/@capacitor/device`)"
   - "CapacitorFilesystem (from `../../node_modules/@capacitor/filesystem`)"
   - "CapacitorHaptics (from `../../node_modules/@capacitor/haptics`)"
   - "CapacitorKeyboard (from `../../node_modules/@capacitor/keyboard`)"
@@ -64,6 +67,8 @@ EXTERNAL SOURCES:
     :path: "../../node_modules/@capacitor-community/safe-area"
   CapacitorCordova:
     :path: "../../node_modules/@capacitor/ios"
+  CapacitorDevice:
+    :path: "../../node_modules/@capacitor/device"
   CapacitorFilesystem:
     :path: "../../node_modules/@capacitor/filesystem"
   CapacitorHaptics:
@@ -91,6 +96,7 @@ SPEC CHECKSUMS:
   CapacitorClipboard: b98aead5dc7ec595547fc2c5d75bacd2ae3338bc
   CapacitorCommunitySafeArea: cc370b4f8d4aa340e4616acef9b73eda41ba0914
   CapacitorCordova: 5967b9ba03915ef1d585469d6e31f31dc49be96f
+  CapacitorDevice: a50a45f0d075e55e2392c7a4be5404d4f69515de
   CapacitorFilesystem: 307f97c27a265edf8396a1c9c235592fd8572fe3
   CapacitorHaptics: 70e47470fa1a6bd6338cd102552e3846b7f9a1b3
   CapacitorKeyboard: 969647d0ca2e5c737d7300088e2517aa832434e2
@@ -101,6 +107,6 @@ SPEC CHECKSUMS:
   JcesarmobileSslSkip: b0f921e9d397a57f7983731209ca1ee244119c1f
   SendIntent: 1f4f65c7103eb423067c566682dfcda973b5fb29
 
-PODFILE CHECKSUM: d1ad773ee5fbd3415c2d78d69f4396a1dc68bed9
+PODFILE CHECKSUM: cb9c70caa3eda97256a3dae9041478673def76cd
 
 COCOAPODS: 1.16.2

+ 1 - 0
package.json

@@ -115,6 +115,7 @@
         "@capacitor/camera": "7.0.1",
         "@capacitor/clipboard": "7.0.1",
         "@capacitor/core": "7.2.0",
+        "@capacitor/device": "^7.0.2",
         "@capacitor/filesystem": "7.0.1",
         "@capacitor/haptics": "7.0.1",
         "@capacitor/ios": "7.2.0",

+ 57 - 30
src/main/mobile/components/recorder.cljs

@@ -1,6 +1,7 @@
 (ns mobile.components.recorder
   "Audio record"
-  (:require [cljs-time.core :as t]
+  (:require ["@capacitor/device" :refer [Device]]
+            [cljs-time.core :as t]
             [clojure.string :as string]
             [frontend.date :as date]
             [frontend.db.model :as db-model]
@@ -28,8 +29,20 @@
     (str (.padStart (str minutes) 2 "0") ":"
          (.padStart (str seconds) 2 "0"))))
 
+(defn- get-locale
+  "Resolves the device language as an underscore-separated locale (e.g. \"en_US\").
+   Special-cases en_CN -> \"zh\". Falls back to \"en_US\" on any error."
+  []
+  (->
+   (p/let [^js lang (.getLanguageTag ^js Device)
+           value (.-value lang)]
+     ;; Device.getLanguageTag returns BCP-47 tags with hyphens (e.g. "en-CN"),
+     ;; so normalize BEFORE comparing — the previous code compared the raw
+     ;; hyphenated value against "en_CN", a branch that could never match.
+     (let [normalized (string/replace value "-" "_")]
+       (if (= normalized "en_CN")
+         "zh"
+         normalized)))
+   (p/catch (fn [e]
+              (js/console.error e)
+              "en_US"))))
+
 (defn save-asset-audio!
-  [blob]
+  [blob locale]
   (let [ext (some-> blob
                     (.-type)
                     (string/split ";")
@@ -53,7 +66,8 @@
         (when asset-entity
           (p/let [buffer-data (.arrayBuffer blob)
                   unit8-data (js/Uint8Array. buffer-data)]
-            (-> (.transcribeAudio2Text mobile-util/ui-local #js {:audioData (js/Array.from unit8-data)})
+            (-> (.transcribeAudio2Text mobile-util/ui-local #js {:audioData (js/Array.from unit8-data)
+                                                                 :locale locale})
                 (p/then (fn [^js r]
                           (let [content (.-transcription r)]
                             (when-not (string/blank? content)
@@ -93,7 +107,7 @@
          ;; events
          (doto r
            (.on "record-end" (fn [^js blob]
-                               (save-asset-audio! blob)
+                               (save-asset-audio! blob "en_US")
                                (mobile-state/close-popup!)))
            (.on "record-progress" (gfun/throttle
                                    (fn [time]
@@ -120,31 +134,44 @@
 
 (rum/defc record-button-2
   []
-  (hooks/use-effect!
-   (fn []
-     (record/start
-      {:on-record-end (fn [^js blob]
-                        (save-asset-audio! blob)
-                        (mobile-state/close-popup!))})
-     (record/attach-visualizer!
-      (js/document.getElementById "wave-canvas")
-      {:mode :rolling
-       :fps 30
-       :fft-size 2048
-       :smoothing 0.8})
-
-     #(record/destroy!))
-   [])
-  [:div.p-6.flex.justify-between
-   [:div.flex.justify-between.items-center.w-full
-      ;; [:span.flex.flex-col.timer-wrap
-      ;;  [:strong.timer {:ref *timer-ref} "00:00"]
-      ;;  [:small "05:00"]]
-    (shui/button {:variant :outline
-                  :class "record-ctrl-btn rounded-full recording"
-                  :on-click (fn []
-                              (record/stop))}
-                 (shui/tabler-icon "player-stop" {:size 22}))]])
+  (let [[locale set-locale!] (hooks/use-state nil)
+        [*locale] (hooks/use-state (atom nil))]
+    (hooks/use-effect!
+     (fn []
+       (p/let [locale (get-locale)]
+         (set-locale! locale)
+         (reset! *locale locale)
+         (record/start
+          {:on-record-end (fn [^js blob]
+                            (save-asset-audio! blob @*locale)
+                            (mobile-state/close-popup!))})
+         (record/attach-visualizer!
+          (js/document.getElementById "wave-canvas")
+          {:mode :rolling
+           :fps 30
+           :fft-size 2048
+           :smoothing 0.8}))
+
+       #(record/destroy!))
+     [])
+    [:div.p-6.flex.justify-between
+     [:div.flex.justify-between.items-center.w-full
+    ;; [:span.flex.flex-col.timer-wrap
+    ;;  [:strong.timer "00:00"]
+    ;;  [:small "05:00"]]
+      (shui/button {:variant :outline
+                    :class "record-ctrl-btn rounded-full recording"
+                    :on-click (fn []
+                                (record/stop))}
+                   (shui/tabler-icon "player-stop" {:size 22}))
+
+      (when locale
+        (when-not (string/starts-with? locale "en_")
+          (shui/button {:variant :outline
+                        :on-click (fn []
+                                    (reset! *locale "en_US")
+                                    (set-locale! "en_US"))}
+                       "English transcribe")))]]))
 
 (rum/defc audio-recorder-aux < rum/static
   []
@@ -158,7 +185,7 @@
     [:div.wave.border.rounded
      [:canvas#wave-canvas
       {:height 200
-       :width 400}]]]
+       :width 320}]]]
 
    ;; (record-button)
    (record-button-2)])

+ 5 - 0
yarn.lock

@@ -291,6 +291,11 @@
   dependencies:
     tslib "^2.1.0"
 
+"@capacitor/device@^7.0.2":
+  version "7.0.2"
+  resolved "https://registry.yarnpkg.com/@capacitor/device/-/device-7.0.2.tgz#406bde129d3fcf55f0de0b691509535e2a00e315"
+  integrity sha512-OMGMBjLbh7ApaqW1oOJIV73iyrFK/T5v2MzuQYq3GLT+jnGvCuj/y82Ofq2Fz9/hlJ2fukztPwG1K80jyk0i6w==
+
 "@capacitor/[email protected]":
   version "7.0.1"
   resolved "https://registry.yarnpkg.com/@capacitor/filesystem/-/filesystem-7.0.1.tgz#b0518d781f7640e936f529b80a59724e221d0471"