initial commit

2026-01-25 18:58:40 +09:00
commit 77af47274c
101 changed files with 16247 additions and 0 deletions
--- a/ios/ExampleiOSApp/App.swift
+++ b/ios/ExampleiOSApp/App.swift
@@ -0,0 +1,10 @@
+import SwiftUI
+
+/// Application entry point for the Supertonic iOS example.
+///
+/// Declares a single window scene whose root view is `ContentView`.
+@main
+struct ExampleiOSApp: App {
+    var body: some Scene {
+        // One window group hosting the demo screen.
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
--- a/ios/ExampleiOSApp/AudioPlayer.swift
+++ b/ios/ExampleiOSApp/AudioPlayer.swift
@@ -0,0 +1,30 @@
+import Foundation
+import AVFoundation
+
+/// Minimal wrapper around `AVAudioPlayer` that plays one clip at a time and
+/// reports when playback has ended.
+///
+/// The `onFinish` closure passed to `play(url:onFinish:)` is invoked at most
+/// once per call: when the clip finishes, when a decode error is reported
+/// during playback, or when the clip could not be loaded or started at all.
+/// `stop()` and a subsequent `play` call discard the pending closure without
+/// invoking it.
+final class AudioPlayer: NSObject, AVAudioPlayerDelegate {
+    private var player: AVAudioPlayer?
+    private var onFinish: (() -> Void)?
+
+    /// Loads the audio at `url` and starts playing it, replacing any clip that
+    /// is currently playing.
+    ///
+    /// - Parameters:
+    ///   - url: Location of the audio data; it is read fully into memory first.
+    ///   - onFinish: Called once when playback ends. If playback cannot be
+    ///     started, it is still called (asynchronously on the main queue) so
+    ///     callers can reset any "is playing" state they set after this call.
+    func play(url: URL, onFinish: (() -> Void)? = nil) {
+        // Drop the previous clip together with its callback so it cannot fire late.
+        stop()
+        self.onFinish = onFinish
+        do {
+            let data = try Data(contentsOf: url)
+            let newPlayer = try AVAudioPlayer(data: data)
+            newPlayer.delegate = self
+            newPlayer.prepareToPlay()
+            // Retain before starting so delegate callbacks can be matched to it.
+            player = newPlayer
+            guard newPlayer.play() else {
+                print("Audio play error: playback could not be started")
+                failPlayback()
+                return
+            }
+        } catch {
+            print("Audio play error: \(error)")
+            failPlayback()
+        }
+    }
+
+    /// Stops the current clip, if any, and discards its completion callback.
+    func stop() {
+        player?.stop()
+        player = nil
+        onFinish = nil
+    }
+
+    // MARK: - AVAudioPlayerDelegate
+
+    func audioPlayerDidFinishPlaying(_ player: AVAudioPlayer, successfully flag: Bool) {
+        // Ignore callbacks from a player that has already been replaced.
+        guard player === self.player else { return }
+        finish()
+    }
+
+    func audioPlayerDecodeErrorDidOccur(_ player: AVAudioPlayer, error: Error?) {
+        guard player === self.player else { return }
+        print("Audio decode error: \(String(describing: error))")
+        finish()
+    }
+
+    // MARK: - Private helpers
+
+    /// Fires the pending callback exactly once and clears it.
+    private func finish() {
+        let callback = onFinish
+        onFinish = nil
+        callback?()
+    }
+
+    /// Releases the failed player and reports the failure through `onFinish`.
+    ///
+    /// The callback is deferred to the next main-queue turn so that it always
+    /// runs after `play(url:onFinish:)` has returned to its caller.
+    private func failPlayback() {
+        player = nil
+        let callback = onFinish
+        onFinish = nil
+        if let callback = callback {
+            DispatchQueue.main.async { callback() }
+        }
+    }
+}
--- a/ios/ExampleiOSApp/ContentView.swift
+++ b/ios/ExampleiOSApp/ContentView.swift
@@ -0,0 +1,99 @@
+import SwiftUI
+
+/// Single-screen UI for the Supertonic demo.
+///
+/// Every control binds to `TTSViewModel`; the view itself holds no other state.
+struct ContentView: View {
+    @StateObject private var vm = TTSViewModel()
+
+    var body: some View {
+        ZStack {
+            backdrop
+
+            VStack(spacing: 20) {
+                Spacer()
+                settingsPanel
+                actionButtons
+                rtfLine
+                errorLine
+                Spacer()
+            }
+        }
+        .onAppear { vm.startup() }
+    }
+
+    // MARK: - Sections
+
+    /// Full-screen gradient drawn behind the controls.
+    private var backdrop: some View {
+        LinearGradient(
+            gradient: Gradient(colors: [Color(.systemBackground), Color(.secondarySystemBackground)]),
+            startPoint: .topLeading,
+            endPoint: .bottomTrailing
+        )
+        .ignoresSafeArea()
+    }
+
+    /// Title, text input and the three synthesis settings.
+    private var settingsPanel: some View {
+        VStack(spacing: 12) {
+            titleLabel
+            textInput
+            nfeRow
+            voicePicker
+            languageRow
+        }
+    }
+
+    private var titleLabel: some View {
+        Text("Supertonic 2 iOS Demo")
+            .font(.title2.weight(.semibold))
+            .foregroundColor(.primary)
+    }
+
+    /// Multiline editor for the text to synthesize, drawn with a rounded outline.
+    private var textInput: some View {
+        let outline = RoundedRectangle(cornerRadius: 12)
+            .stroke(Color.secondary.opacity(0.3), lineWidth: 1)
+        return TextEditor(text: $vm.text)
+            .frame(minHeight: 120, maxHeight: 180)
+            .padding(8)
+            .background(Color(.secondarySystemBackground))
+            .cornerRadius(12)
+            .overlay(outline)
+            .padding(.horizontal)
+    }
+
+    /// Slider for the NFE setting (whole steps from 2 to 15) with its current value.
+    private var nfeRow: some View {
+        HStack(spacing: 12) {
+            Text("NFE")
+                .font(.subheadline)
+                .foregroundColor(.secondary)
+            Slider(value: $vm.nfe, in: 2...15, step: 1)
+            Text("\(Int(vm.nfe))")
+                .font(.subheadline.monospacedDigit())
+                .frame(width: 36)
+        }
+        .padding(.horizontal)
+    }
+
+    private var voicePicker: some View {
+        Picker("Voice", selection: $vm.voice) {
+            Text("M").tag(TTSService.Voice.male)
+            Text("F").tag(TTSService.Voice.female)
+        }
+        .pickerStyle(.segmented)
+        .padding(.horizontal)
+    }
+
+    private var languageRow: some View {
+        HStack(spacing: 12) {
+            Text("Language")
+                .font(.subheadline)
+                .foregroundColor(.secondary)
+            Picker("Language", selection: $vm.language) {
+                ForEach(TTSService.Language.allCases, id: \.self) { option in
+                    Text(option.displayName).tag(option)
+                }
+            }
+            .pickerStyle(.menu)
+        }
+        .padding(.horizontal)
+    }
+
+    /// Generate and Play/Stop buttons.
+    private var actionButtons: some View {
+        HStack(spacing: 16) {
+            Button {
+                vm.generate()
+            } label: {
+                Label(
+                    vm.isGenerating ? "Generating..." : "Generate",
+                    systemImage: vm.isGenerating ? "hourglass" : "wand.and.stars"
+                )
+                .labelStyle(.titleAndIcon)
+            }
+            .buttonStyle(.borderedProminent)
+            .tint(.accentColor)
+            .disabled(vm.isGenerating)
+
+            Button {
+                vm.togglePlay()
+            } label: {
+                Label(
+                    vm.isPlaying ? "Stop" : "Play",
+                    systemImage: vm.isPlaying ? "stop.fill" : "play.fill"
+                )
+            }
+            .buttonStyle(.bordered)
+            .disabled(vm.audioURL == nil)
+        }
+    }
+
+    /// Timing summary; present only when the view model has one to show.
+    @ViewBuilder
+    private var rtfLine: some View {
+        if let summary = vm.rtfText {
+            Text(summary)
+                .font(.footnote.monospacedDigit())
+                .foregroundColor(.secondary)
+                .padding(.top, 2)
+        }
+    }
+
+    /// Error text; present only when the view model reports an error.
+    @ViewBuilder
+    private var errorLine: some View {
+        if let message = vm.errorMessage {
+            Text(message)
+                .foregroundColor(.red)
+                .font(.footnote)
+                .multilineTextAlignment(.center)
+                .padding(.horizontal)
+        }
+    }
+}
--- a/ios/ExampleiOSApp/Info.plist
+++ b/ios/ExampleiOSApp/Info.plist
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>ExampleiOSApp</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>1</string>
+	<key>UILaunchScreen</key>
+	<dict/>
+	<key>UIApplicationSceneManifest</key>
+	<dict>
+		<key>UIApplicationSupportsMultipleScenes</key>
+		<false/>
+	</dict>
+</dict>
+</plist>
--- a/ios/ExampleiOSApp/TTSService.swift
+++ b/ios/ExampleiOSApp/TTSService.swift
@@ -0,0 +1,114 @@
+import Foundation
+import OnnxRuntimeBindings
+
+/// Text-to-speech front end used by the example app.
+///
+/// Locates the bundled ONNX assets, builds the `TextToSpeech` pipeline once at
+/// init time, and renders each request to a WAV file in the temporary directory.
+final class TTSService {
+    /// Speaker selection; mapped to the bundled `M1.json` / `F1.json` style files.
+    enum Voice { case male, female }
+
+    /// Languages offered by the demo. The raw value is the language argument
+    /// handed to the synthesizer.
+    enum Language: String, CaseIterable {
+        case en, ko, es, pt, fr
+
+        /// Name shown in the language picker.
+        var displayName: String {
+            switch self {
+            case .en: return "English"
+            case .ko: return "한국어"
+            case .es: return "Español"
+            case .pt: return "Português"
+            case .fr: return "Français"
+            }
+        }
+    }
+
+    private let env: ORTEnv
+    private let textToSpeech: TextToSpeech
+    private let bundleOnnxDir: String
+    private let sampleRate: Int
+
+    /// Finds the model directory in the app bundle and loads the pipeline.
+    ///
+    /// - Throws: An `NSError` (domain `"TTS"`, code `-100`) when the model files
+    ///   are not bundled, or whatever error environment/model loading throws.
+    init() throws {
+        bundleOnnxDir = try Self.locateOnnxDirInBundle()
+        env = try ORTEnv(loggingLevel: .warning)
+        textToSpeech = try loadTextToSpeech(bundleOnnxDir, false, env)
+        sampleRate = textToSpeech.sampleRate
+    }
+
+    /// Synthesizes `text` and writes the result to a new WAV file.
+    ///
+    /// The body contains no suspension points; the work runs synchronously
+    /// once the call has started.
+    ///
+    /// - Parameters:
+    ///   - text: Text to speak.
+    ///   - nfe: Step count forwarded to the synthesizer.
+    ///   - voice: Which bundled voice style to load.
+    ///   - language: Language of `text`.
+    /// - Returns: URL of the WAV file in the temporary directory. This class
+    ///   never deletes the file; the caller owns it.
+    /// - Throws: Lookup, style-loading, synthesis, or file-writing errors.
+    func synthesize(text: String, nfe: Int, voice: Voice, language: Language) async throws -> URL {
+        // 1) Load the style for the selected voice
+        let styleURL = try Self.locateVoiceStyleURL(voice: voice)
+        let style = try loadVoiceStyle([styleURL.path], verbose: false)
+
+        // 2) Synthesize via packed TextToSpeech component
+        let (wav, duration) = try textToSpeech.call(text, language.rawValue, style, nfe)
+
+        // 3) Keep only the samples covered by the reported duration. The count is
+        //    clamped so a negative, oversized, or non-finite duration cannot trap.
+        let sampleCount = Self.clampedSampleCount(
+            seconds: Double(duration),
+            sampleRate: sampleRate,
+            available: wav.count
+        )
+        let wavOut = Array(wav.prefix(sampleCount))
+
+        // 4) Write to a uniquely named temp file
+        let tmpURL = FileManager.default.temporaryDirectory.appendingPathComponent("supertonic_tts_\(UUID().uuidString).wav")
+        try writeWavFile(tmpURL.path, wavOut, sampleRate)
+
+        return tmpURL
+    }
+
+    /// Converts a duration into a sample count limited to `0...available`.
+    ///
+    /// A non-finite duration keeps the whole buffer instead of crashing.
+    private static func clampedSampleCount(seconds: Double, sampleRate: Int, available: Int) -> Int {
+        let requested = Double(sampleRate) * seconds
+        guard requested.isFinite else { return available }
+        return Int(min(max(requested, 0), Double(available)))
+    }
+
+    // MARK: - Resource location helpers
+
+    /// Returns the path of the first candidate bundle directory that contains
+    /// every required model file.
+    private static func locateOnnxDirInBundle() throws -> String {
+        let bundle = Bundle.main
+        let fm = FileManager.default
+        let requiredFiles = [
+            "tts.json",
+            "duration_predictor.onnx",
+            "text_encoder.onnx",
+            "vector_estimator.onnx",
+            "vocoder.onnx"
+        ]
+
+        func dirHasRequiredFiles(_ dir: URL) -> Bool {
+            requiredFiles.allSatisfy { fm.fileExists(atPath: dir.appendingPathComponent($0).path) }
+        }
+
+        // Candidates in priority order: explicit folders first, then wherever
+        // tts.json is found, and finally the bundle root.
+        var candidates: [URL] = []
+        if let dir = bundle.resourceURL?.appendingPathComponent("onnx", isDirectory: true) { candidates.append(dir) }
+        if let dir = bundle.resourceURL?.appendingPathComponent("assets/onnx", isDirectory: true) { candidates.append(dir) }
+        if let url = bundle.url(forResource: "tts", withExtension: "json", subdirectory: "onnx") { candidates.append(url.deletingLastPathComponent()) }
+        if let url = bundle.url(forResource: "tts", withExtension: "json", subdirectory: "assets/onnx") { candidates.append(url.deletingLastPathComponent()) }
+        if let url = bundle.url(forResource: "tts", withExtension: "json", subdirectory: nil) { candidates.append(url.deletingLastPathComponent()) }
+        if let root = bundle.resourceURL { candidates.append(root) }
+
+        if let match = candidates.first(where: dirHasRequiredFiles) {
+            return match.path
+        }
+        throw NSError(
+            domain: "TTS",
+            code: -100,
+            userInfo: [NSLocalizedDescriptionKey: "Could not find the onnx directory in the bundle. Please make sure the onnx folder (as a folder reference) is included in Copy Bundle Resources in Xcode."]
+        )
+    }
+
+    /// Returns the URL of the style JSON for `voice` (`M1.json` or `F1.json`).
+    private static func locateVoiceStyleURL(voice: Voice) throws -> URL {
+        // Exhaustive switch: adding a Voice case must also pick a style file.
+        let fileName: String
+        switch voice {
+        case .male: fileName = "M1"
+        case .female: fileName = "F1"
+        }
+
+        let bundle = Bundle.main
+
+        // Bundle lookup in common subdirectories, then the bundle root.
+        let subdirectories: [String?] = ["voice_styles", "assets/voice_styles", nil]
+        for subdirectory in subdirectories {
+            if let url = bundle.url(forResource: fileName, withExtension: "json", subdirectory: subdirectory) {
+                return url
+            }
+        }
+
+        // Fallback: check the same folders directly on disk.
+        if let root = bundle.resourceURL {
+            for folder in ["voice_styles", "assets/voice_styles"] {
+                let file = root
+                    .appendingPathComponent(folder, isDirectory: true)
+                    .appendingPathComponent("\(fileName).json")
+                if FileManager.default.fileExists(atPath: file.path) { return file }
+            }
+        }
+
+        throw NSError(
+            domain: "TTS",
+            code: -102,
+            userInfo: [NSLocalizedDescriptionKey: "Could not find the voice style JSON (\(fileName).json) in the bundle. Ensure voice_styles folder is included in Copy Bundle Resources."]
+        )
+    }
+}
--- a/ios/ExampleiOSApp/TTSViewModel.swift
+++ b/ios/ExampleiOSApp/TTSViewModel.swift
@@ -0,0 +1,82 @@
+import Foundation
+import AVFoundation
+
+/// Main-actor view model backing `ContentView`.
+///
+/// Owns the user-editable synthesis settings, the `TTSService` instance, and
+/// the playback state shown by the UI.
+@MainActor
+final class TTSViewModel: ObservableObject {
+    @Published var text: String = "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
+    @Published var nfe: Double = 5
+    @Published var voice: TTSService.Voice = .male
+    @Published var language: TTSService.Language = .en
+    @Published var isGenerating: Bool = false
+    @Published var isPlaying: Bool = false
+    @Published var errorMessage: String?
+    @Published var audioURL: URL?
+    @Published var elapsedSeconds: Double?
+    @Published var audioSeconds: Double?
+
+    private var service: TTSService?
+    private let player = AudioPlayer()
+
+    /// Summary line with the ratio of elapsed time to audio length, or `nil`
+    /// until both values are known and the audio length is positive.
+    var rtfText: String? {
+        guard let e = elapsedSeconds, let a = audioSeconds, a > 0 else { return nil }
+        return String(format: "RTF %.2fx · %.2fs / %.2fs", e / a, e, a)
+    }
+
+    /// Creates the TTS service if it does not exist yet.
+    ///
+    /// Safe to call repeatedly (for example from every `onAppear`): an existing
+    /// service is kept, and a failed attempt is retried on the next call.
+    func startup() {
+        guard service == nil else { return }
+        do {
+            service = try TTSService()
+            // While no service exists, the only error that can be showing is an
+            // earlier init failure, so it is safe to clear after a successful retry.
+            errorMessage = nil
+        } catch {
+            errorMessage = "Failed to init TTS: \(error.localizedDescription)"
+        }
+    }
+
+    /// Synthesizes the current text with the current settings and plays the result.
+    ///
+    /// Does nothing when the service is unavailable or a request is already running.
+    func generate() {
+        guard let service = service, !isGenerating else { return }
+
+        // Stop playback first: clearing `audioURL` below disables the Play/Stop
+        // button, which would otherwise leave audio playing with no way to stop it.
+        player.stop()
+        isPlaying = false
+
+        // Best-effort cleanup of the previous clip; a failure here is harmless.
+        if let staleURL = audioURL {
+            try? FileManager.default.removeItem(at: staleURL)
+        }
+
+        isGenerating = true
+        errorMessage = nil
+        audioURL = nil
+        elapsedSeconds = nil
+        audioSeconds = nil
+
+        // Snapshot the inputs now so later edits do not affect this request.
+        let requestText = text
+        let steps = Int(nfe)
+        let requestVoice = voice
+        let requestLanguage = language
+
+        // The task inherits this class's main-actor isolation, so the published
+        // properties can be updated directly once synthesis returns.
+        Task {
+            let tic = Date()
+            do {
+                let url = try await service.synthesize(
+                    text: requestText,
+                    nfe: steps,
+                    voice: requestVoice,
+                    language: requestLanguage
+                )
+                let elapsed = Date().timeIntervalSince(tic)
+                let audio = self.audioDuration(at: url)
+                self.audioURL = url
+                self.elapsedSeconds = elapsed
+                self.audioSeconds = audio
+                self.isGenerating = false
+                self.play(url: url)
+            } catch {
+                self.errorMessage = error.localizedDescription
+                self.isGenerating = false
+            }
+        }
+    }
+
+    /// Stops playback if it is running; otherwise replays the last clip, if any.
+    func togglePlay() {
+        if isPlaying {
+            player.stop()
+            isPlaying = false
+        } else if let url = audioURL {
+            play(url: url)
+        }
+    }
+
+    /// Starts playback and marks the model as playing until the player reports the end.
+    private func play(url: URL) {
+        player.play(url: url) { [weak self] in
+            // Hop to the main queue: the reset must run after `isPlaying = true` below.
+            DispatchQueue.main.async { self?.isPlaying = false }
+        }
+        isPlaying = true
+    }
+
+    /// Length of the audio file in seconds, or `nil` if it cannot be read or
+    /// reports a non-positive sample rate.
+    private func audioDuration(at url: URL) -> Double? {
+        guard let file = try? AVAudioFile(forReading: url) else { return nil }
+        let rate = file.fileFormat.sampleRate
+        guard rate > 0 else { return nil }
+        return Double(file.length) / rate
+    }
+}
--- a/ios/ExampleiOSApp/project.yml
+++ b/ios/ExampleiOSApp/project.yml
@@ -0,0 +1,29 @@
+# XcodeGen spec for the Supertonic iOS example app.
+name: ExampleiOSApp
+options:
+  minimumXcodeGenVersion: 2.37.0
+packages:
+  onnxruntime:
+    url: https://github.com/microsoft/onnxruntime-swift-package-manager.git
+    from: 1.16.0
+targets:
+  ExampleiOSApp:
+    type: application
+    platform: iOS
+    deploymentTarget: "15.0"
+    sources:
+      - path: .
+      # Shared Swift helper from the repository's swift/ package.
+      - path: ../../swift/Sources/Helper.swift
+        type: file
+    # Asset folders prepared in README step 0 (onnx/ and voice_styles/).
+    resources:
+      - path: onnx
+        type: folder
+      - path: voice_styles
+        type: folder
+    settings:
+      base:
+        PRODUCT_BUNDLE_IDENTIFIER: com.supertonic.ExampleiOSApp
+        SWIFT_VERSION: 5.9
+        INFOPLIST_FILE: Info.plist
+    dependencies:
+      - package: onnxruntime
+        product: onnxruntime
--- a/ios/README.md
+++ b/ios/README.md
@@ -0,0 +1,78 @@
+# Supertonic iOS Example App
+
+A minimal iOS demo that runs Supertonic 2 (ONNX Runtime) on-device. The app shows:
+- Multiline text input
+- NFE (denoising steps) slider
+- Voice toggle (M/F)
+- Language selector (en, ko, es, pt, fr)
+- Generate & Play buttons
+- RTF display (Elapsed / Audio seconds)
+
+All ONNX models/configs are reused from `Supertonic/assets/onnx`, and voice style JSON files from `Supertonic/assets/voice_styles`.
+
+## 📰 Update News
+
+**2026.01.06** - 🎉 **Supertonic 2** released with multilingual support! Now supports English (`en`), Korean (`ko`), Spanish (`es`), Portuguese (`pt`), and French (`fr`). [Demo](https://huggingface.co/spaces/Supertone/supertonic-2) | [Models](https://huggingface.co/Supertone/supertonic-2)
+
+**2025.12.10** - Added [6 new voice styles](https://huggingface.co/Supertone/supertonic/tree/b10dbaf18b316159be75b34d24f740008fddd381) (M3, M4, M5, F3, F4, F5). See [Voices](https://supertone-inc.github.io/supertonic-py/voices/) for details
+
+**2025.12.08** - Optimized ONNX models via [OnnxSlim](https://github.com/inisis/OnnxSlim) now available on [Hugging Face Models](https://huggingface.co/Supertone/supertonic)
+
+## Prerequisites
+- macOS 13+, Xcode 15+
+- Swift 5.9+
+- iOS 15+ device (recommended)
+- Homebrew, XcodeGen
+
+Install tools (if needed):
+```bash
+brew install xcodegen
+```
+
+## Quick Start (zero-click in Xcode)
+0) Prepare assets next to the iOS target (one-time)
+```bash
+cd ios/ExampleiOSApp
+mkdir -p onnx voice_styles
+rsync -a ../../assets/onnx/ onnx/
+rsync -a ../../assets/voice_styles/ voice_styles/
+```
+
+1) Generate the Xcode project with XcodeGen
+```bash
+xcodegen generate
+open ExampleiOSApp.xcodeproj
+```
+
+2) Open in Xcode and select your iPhone as the run destination
+- Targets → ExampleiOSApp → Signing & Capabilities: Select your Team
+- iOS Deployment Target: 15.0+
+
+3) Build & Run on device
+- Type text → adjust NFE/Voice/Language → tap Generate → audio plays automatically
+- An RTF line shows like: `RTF 0.30x · 3.04s / 10.11s`
+
+## What's included (generated project)
+- SwiftUI app files: `App.swift`, `ContentView.swift`, `TTSViewModel.swift`, `AudioPlayer.swift`
+- Runtime wrapper: `TTSService.swift` (locates the bundled models and voice styles, then calls the shared helper code from `swift/Sources/Helper.swift`)
+- Resources (local, vendored in `ios/ExampleiOSApp/onnx` and `ios/ExampleiOSApp/voice_styles` after step 0)
+
+These references are defined in `project.yml` and added to the app bundle by XcodeGen.
+
+## App Controls
+- **Text**: Multiline `TextEditor`
+- **NFE**: Denoising steps (default 5)
+- **Voice**: M/F voice style selector
+- **Language**: Language selector (English, 한국어, Español, Português, Français)
+- **Generate**: Runs end-to-end synthesis
+- **Play/Stop**: Controls playback of the last output
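+- **RTF**: Real-time factor — elapsed synthesis time divided by audio duration, shown with both values in seconds (e.g. `0.30x` means synthesis took 30% of the clip's length)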
+
+## Multilingual Support
+
+Supertonic 2 supports multiple languages. Select the appropriate language for your input text:
+- **English (en)**: Default language
+- **한국어 (ko)**: Korean
+- **Español (es)**: Spanish
+- **Português (pt)**: Portuguese
+- **Français (fr)**: French