initial commit

This commit is contained in:
2026-01-25 18:58:40 +09:00
commit 77af47274c
101 changed files with 16247 additions and 0 deletions

View File

@@ -0,0 +1,10 @@
import SwiftUI
/// Application entry point: presents `ContentView` in the app's single window group.
@main
struct ExampleiOSApp: App {
    var body: some Scene {
        WindowGroup { ContentView() }
    }
}

View File

@@ -0,0 +1,30 @@
import Foundation
import AVFoundation
/// Minimal one-shot audio player built on `AVAudioPlayer`.
///
/// Holds at most one active player and one completion callback at a time.
final class AudioPlayer: NSObject, AVAudioPlayerDelegate {
    private var player: AVAudioPlayer?
    private var onFinish: (() -> Void)?

    /// Loads the audio at `url` into memory and starts playing it.
    ///
    /// Any playback already in progress is stopped first and its callback is dropped.
    ///
    /// - Parameters:
    ///   - url: Location of the audio data to play.
    ///   - onFinish: Called at most once: when playback ends, or right away
    ///     (synchronously) if the audio could not be loaded or started, so callers
    ///     can always reset their "is playing" state.
    func play(url: URL, onFinish: (() -> Void)? = nil) {
        // Supersede whatever was playing before; its callback must not fire later.
        player?.stop()
        player = nil
        self.onFinish = onFinish

        do {
            let data = try Data(contentsOf: url)
            let newPlayer = try AVAudioPlayer(data: data)
            newPlayer.delegate = self
            newPlayer.prepareToPlay()
            // Retain before starting so delegate callbacks can be matched to it.
            player = newPlayer
            guard newPlayer.play() else {
                // `play()` reports failure via its Bool result rather than by throwing.
                print("Audio play error: playback could not be started")
                player = nil
                finish()
                return
            }
        } catch {
            print("Audio play error: \(error)")
            finish()
        }
    }

    /// Stops playback (if any) and discards the pending completion callback.
    func stop() {
        player?.stop()
        player = nil
        onFinish = nil
    }

    // MARK: - AVAudioPlayerDelegate

    /// Forwards the end of playback to the stored completion callback.
    func audioPlayerDidFinishPlaying(_ player: AVAudioPlayer, successfully flag: Bool) {
        // Ignore callbacks from a player that has since been replaced or stopped.
        guard player === self.player else { return }
        finish()
    }

    /// Treats a decode failure like the end of playback so callers are not left waiting.
    func audioPlayerDecodeErrorDidOccur(_ player: AVAudioPlayer, error: Error?) {
        guard player === self.player else { return }
        print("Audio play error: \(String(describing: error))")
        finish()
    }

    // MARK: - Private

    /// Invokes the completion callback exactly once and releases it.
    private func finish() {
        let callback = onFinish
        onFinish = nil
        callback?()
    }
}

View File

@@ -0,0 +1,99 @@
import SwiftUI
/// Single-screen demo UI: text input, synthesis settings, Generate/Play buttons,
/// and status lines, all bound to a `TTSViewModel`.
struct ContentView: View {
    @StateObject private var vm = TTSViewModel()

    var body: some View {
        ZStack {
            backdrop
            VStack(spacing: 20) {
                Spacer()
                settingsPanel
                actionButtons
                rtfLine
                errorLine
                Spacer()
            }
        }
        .onAppear { vm.startup() }
    }

    // MARK: - Background

    /// Full-screen diagonal gradient drawn behind the controls.
    private var backdrop: some View {
        LinearGradient(
            gradient: Gradient(colors: [Color(.systemBackground), Color(.secondarySystemBackground)]),
            startPoint: .topLeading,
            endPoint: .bottomTrailing
        )
        .ignoresSafeArea()
    }

    // MARK: - Settings

    /// Title, text editor, and the NFE / voice / language controls.
    private var settingsPanel: some View {
        VStack(spacing: 12) {
            Text("Supertonic 2 iOS Demo")
                .font(.title2.weight(.semibold))
                .foregroundColor(.primary)
            textInput
            stepsRow
            voicePicker
            languageRow
        }
    }

    /// Multiline editor for the text to synthesize.
    private var textInput: some View {
        TextEditor(text: $vm.text)
            .frame(minHeight: 120, maxHeight: 180)
            .padding(8)
            .background(Color(.secondarySystemBackground))
            .cornerRadius(12)
            .overlay(
                RoundedRectangle(cornerRadius: 12)
                    .stroke(Color.secondary.opacity(0.3), lineWidth: 1)
            )
            .padding(.horizontal)
    }

    /// Slider for the NFE value (2 through 15 in whole steps) with a numeric readout.
    private var stepsRow: some View {
        HStack(spacing: 12) {
            Text("NFE")
                .font(.subheadline)
                .foregroundColor(.secondary)
            Slider(value: $vm.nfe, in: 2...15, step: 1)
            Text("\(Int(vm.nfe))")
                .font(.subheadline.monospacedDigit())
                .frame(width: 36)
        }
        .padding(.horizontal)
    }

    /// Segmented control choosing between the male and female voice.
    private var voicePicker: some View {
        Picker("Voice", selection: $vm.voice) {
            Text("M").tag(TTSService.Voice.male)
            Text("F").tag(TTSService.Voice.female)
        }
        .pickerStyle(.segmented)
        .padding(.horizontal)
    }

    /// Menu listing every `TTSService.Language` case.
    private var languageRow: some View {
        HStack(spacing: 12) {
            Text("Language")
                .font(.subheadline)
                .foregroundColor(.secondary)
            Picker("Language", selection: $vm.language) {
                ForEach(TTSService.Language.allCases, id: \.self) { option in
                    Text(option.displayName).tag(option)
                }
            }
            .pickerStyle(.menu)
        }
        .padding(.horizontal)
    }

    // MARK: - Actions

    /// Generate and Play/Stop buttons.
    private var actionButtons: some View {
        HStack(spacing: 16) {
            Button {
                vm.generate()
            } label: {
                Label(
                    vm.isGenerating ? "Generating..." : "Generate",
                    systemImage: vm.isGenerating ? "hourglass" : "wand.and.stars"
                )
                .labelStyle(.titleAndIcon)
            }
            .buttonStyle(.borderedProminent)
            .tint(.accentColor)
            .disabled(vm.isGenerating)

            Button {
                vm.togglePlay()
            } label: {
                Label(
                    vm.isPlaying ? "Stop" : "Play",
                    systemImage: vm.isPlaying ? "stop.fill" : "play.fill"
                )
            }
            .buttonStyle(.bordered)
            .disabled(vm.audioURL == nil)
        }
    }

    // MARK: - Status

    /// Timing summary, shown only when the view model has one.
    @ViewBuilder
    private var rtfLine: some View {
        if let summary = vm.rtfText {
            Text(summary)
                .font(.footnote.monospacedDigit())
                .foregroundColor(.secondary)
                .padding(.top, 2)
        }
    }

    /// Error message, shown only when the view model has one.
    @ViewBuilder
    private var errorLine: some View {
        if let message = vm.errorMessage {
            Text(message)
                .foregroundColor(.red)
                .font(.footnote)
                .multilineTextAlignment(.center)
                .padding(.horizontal)
        }
    }
}

View File

@@ -0,0 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>en</string>
<key>CFBundleExecutable</key>
<string>$(EXECUTABLE_NAME)</string>
<key>CFBundleIdentifier</key>
<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>
<string>ExampleiOSApp</string>
<key>CFBundlePackageType</key>
<string>APPL</string>
<key>CFBundleShortVersionString</key>
<string>1.0</string>
<key>CFBundleVersion</key>
<string>1</string>
<key>UILaunchScreen</key>
<dict/>
<key>UIApplicationSceneManifest</key>
<dict>
<key>UIApplicationSupportsMultipleScenes</key>
<false/>
</dict>
</dict>
</plist>

View File

@@ -0,0 +1,114 @@
import Foundation
import OnnxRuntimeBindings
/// Wrapper that locates the bundled model and voice-style files and runs
/// text-to-speech synthesis, writing the result to a temporary WAV file.
///
/// `loadTextToSpeech`, `loadVoiceStyle`, `writeWavFile` and `TextToSpeech` are
/// declared outside this file.
final class TTSService {
    /// Voice selection; mapped to a bundled style file ("M1" or "F1").
    enum Voice { case male, female }

    /// Languages offered in the UI; the raw value is the code passed to the synthesizer.
    enum Language: String, CaseIterable {
        case en = "en"
        case ko = "ko"
        case es = "es"
        case pt = "pt"
        case fr = "fr"

        /// Name shown in the language picker.
        var displayName: String {
            switch self {
            case .en: return "English"
            case .ko: return "한국어"
            case .es: return "Español"
            case .pt: return "Português"
            case .fr: return "Français"
            }
        }
    }

    private let env: ORTEnv
    private let textToSpeech: TextToSpeech
    private let bundleOnnxDir: String
    private let sampleRate: Int

    /// Finds the model directory in the main bundle and loads the synthesizer from it.
    ///
    /// - Throws: An `NSError` (domain "TTS", code -100) when the model directory cannot
    ///   be found, or whatever `ORTEnv` / `loadTextToSpeech` throw.
    init() throws {
        bundleOnnxDir = try Self.locateOnnxDirInBundle()
        env = try ORTEnv(loggingLevel: .warning)
        textToSpeech = try loadTextToSpeech(bundleOnnxDir, false, env)
        sampleRate = textToSpeech.sampleRate
    }

    /// Synthesizes `text` and writes the audio to a uniquely named WAV file in the
    /// temporary directory.
    ///
    /// - Parameters:
    ///   - text: Text to speak.
    ///   - nfe: Step count forwarded to the synthesizer.
    ///   - voice: Which bundled voice style to load.
    ///   - language: Language whose raw value is forwarded to the synthesizer.
    /// - Returns: URL of the written WAV file. The caller owns the file.
    /// - Throws: An `NSError` (domain "TTS", code -102) when the voice style file is
    ///   missing, or whatever loading, synthesis, or file writing throws.
    func synthesize(text: String, nfe: Int, voice: Voice, language: Language) async throws -> URL {
        // 1) Load the style for the selected voice.
        let styleURL = try Self.locateVoiceStyleURL(voice: voice)
        let style = try loadVoiceStyle([styleURL.path], verbose: false)

        // 2) Run synthesis.
        let (wav, duration) = try textToSpeech.call(text, language.rawValue, style, nfe)

        // 3) Keep only the samples covered by the reported duration. The count is
        //    clamped to 0...wav.count in floating point first, so a negative, NaN or
        //    oversized duration cannot trap in `Int(...)` or produce an invalid range.
        let requestedSamples = Double(sampleRate) * Double(duration)
        let sampleCount: Int
        if requestedSamples.isNaN {
            // No usable duration: keep the whole buffer rather than dropping audio.
            sampleCount = wav.count
        } else {
            sampleCount = Int(min(max(requestedSamples, 0), Double(wav.count)))
        }
        let wavOut = Array(wav.prefix(sampleCount))

        // 4) Write the result where the player can read it back.
        let tmpURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("supertonic_tts_\(UUID().uuidString).wav")
        try writeWavFile(tmpURL.path, wavOut, sampleRate)
        return tmpURL
    }

    // MARK: - Resource location helpers

    /// Returns the path of the first candidate directory in the main bundle that
    /// contains every required model file.
    ///
    /// Candidates, in order: `onnx/` and `assets/onnx/` under the resource root,
    /// the folder of any `tts.json` the bundle can resolve, then the resource root.
    private static func locateOnnxDirInBundle() throws -> String {
        let bundle = Bundle.main
        let fileManager = FileManager.default
        let requiredFiles = [
            "tts.json",
            "duration_predictor.onnx",
            "text_encoder.onnx",
            "vector_estimator.onnx",
            "vocoder.onnx"
        ]
        let folderNames = ["onnx", "assets/onnx"]
        let lookupSubdirectories: [String?] = ["onnx", "assets/onnx", nil]

        var candidates: [URL] = []
        if let root = bundle.resourceURL {
            candidates += folderNames.map { root.appendingPathComponent($0, isDirectory: true) }
        }
        candidates += lookupSubdirectories.compactMap { subdirectory in
            bundle.url(forResource: "tts", withExtension: "json", subdirectory: subdirectory)?
                .deletingLastPathComponent()
        }
        if let root = bundle.resourceURL {
            candidates.append(root)
        }

        let match = candidates.first { directory in
            requiredFiles.allSatisfy { name in
                fileManager.fileExists(atPath: directory.appendingPathComponent(name).path)
            }
        }
        if let match = match {
            return match.path
        }

        throw NSError(
            domain: "TTS",
            code: -100,
            userInfo: [NSLocalizedDescriptionKey: "Could not find the onnx directory in the bundle. Please make sure the onnx folder (as a folder reference) is included in Copy Bundle Resources in Xcode."]
        )
    }

    /// Returns the URL of the bundled style JSON for `voice` ("M1" for male, "F1" for female).
    ///
    /// Bundle resource lookup is tried first (in `voice_styles`, `assets/voice_styles`,
    /// then the bundle root), followed by a direct file-existence check in the two folders.
    private static func locateVoiceStyleURL(voice: Voice) throws -> URL {
        let fileName: String
        switch voice {
        case .male: fileName = "M1"
        case .female: fileName = "F1"
        }
        let bundle = Bundle.main

        let lookupSubdirectories: [String?] = ["voice_styles", "assets/voice_styles", nil]
        for subdirectory in lookupSubdirectories {
            if let url = bundle.url(forResource: fileName, withExtension: "json", subdirectory: subdirectory) {
                return url
            }
        }

        // Fallback: probe the folders directly in case resource lookup did not resolve them.
        if let root = bundle.resourceURL {
            for folderName in ["voice_styles", "assets/voice_styles"] {
                let file = root
                    .appendingPathComponent(folderName, isDirectory: true)
                    .appendingPathComponent("\(fileName).json")
                if FileManager.default.fileExists(atPath: file.path) {
                    return file
                }
            }
        }

        throw NSError(
            domain: "TTS",
            code: -102,
            userInfo: [NSLocalizedDescriptionKey: "Could not find the voice style JSON (\(fileName).json) in the bundle. Ensure voice_styles folder is included in Copy Bundle Resources."]
        )
    }
}

View File

@@ -0,0 +1,82 @@
import Foundation
import AVFoundation
/// Main-actor view model backing the demo screen: holds the user's settings,
/// drives synthesis through `TTSService`, and controls playback of the result.
@MainActor
final class TTSViewModel: ObservableObject {
    @Published var text: String = "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    @Published var nfe: Double = 5
    @Published var voice: TTSService.Voice = .male
    @Published var language: TTSService.Language = .en
    @Published var isGenerating: Bool = false
    @Published var isPlaying: Bool = false
    @Published var errorMessage: String?
    @Published var audioURL: URL?
    @Published var elapsedSeconds: Double?
    @Published var audioSeconds: Double?

    private var service: TTSService?
    private let player = AudioPlayer()

    /// One-line timing summary (elapsed / audio ratio, elapsed seconds, audio seconds),
    /// or `nil` until both values are known and the audio length is positive.
    var rtfText: String? {
        guard let e = elapsedSeconds, let a = audioSeconds, a > 0 else { return nil }
        return String(format: "RTF %.2fx · %.2fs / %.2fs", e / a, e, a)
    }

    /// Creates the TTS service once; on failure the error is surfaced via `errorMessage`.
    ///
    /// Safe to call repeatedly (e.g. from `onAppear`): an already-created service is reused.
    func startup() {
        guard service == nil else { return }
        do {
            service = try TTSService()
        } catch {
            errorMessage = "Failed to init TTS: \(error.localizedDescription)"
        }
    }

    /// Synthesizes the current text with the current settings, then plays the result.
    ///
    /// Does nothing if the service is unavailable or a generation is already running.
    func generate() {
        guard let service = service, !isGenerating else { return }

        // Stop whatever is playing so `isPlaying` cannot stay true while the
        // Play/Stop button is disabled for lack of an `audioURL`.
        player.stop()
        isPlaying = false

        // The previous output becomes unreachable below; remove its temp file
        // (best effort) so repeated generations do not pile up on disk.
        if let previousURL = audioURL {
            try? FileManager.default.removeItem(at: previousURL)
        }

        isGenerating = true
        errorMessage = nil
        audioURL = nil
        elapsedSeconds = nil
        audioSeconds = nil

        // This Task inherits the main actor, so state can be updated directly after the await.
        Task {
            let start = Date()
            do {
                let url = try await service.synthesize(text: text, nfe: Int(nfe), voice: voice, language: language)
                let elapsed = Date().timeIntervalSince(start)
                audioURL = url
                elapsedSeconds = elapsed
                audioSeconds = audioDuration(at: url)
                isGenerating = false
                play(url: url)
            } catch {
                errorMessage = error.localizedDescription
                isGenerating = false
            }
        }
    }

    /// Stops playback if it is running; otherwise replays the last generated audio, if any.
    func togglePlay() {
        if isPlaying {
            player.stop()
            isPlaying = false
        } else if let url = audioURL {
            play(url: url)
        }
    }

    /// Starts playback and clears `isPlaying` when the player reports completion.
    private func play(url: URL) {
        player.play(url: url) { [weak self] in
            // Deferred to the next main-queue turn so it always lands after the
            // `isPlaying = true` below, even if the callback fires synchronously.
            DispatchQueue.main.async { self?.isPlaying = false }
        }
        isPlaying = true
    }

    /// Length of the audio file in seconds, or `nil` if it cannot be opened or
    /// reports a non-positive sample rate.
    private func audioDuration(at url: URL) -> Double? {
        guard let file = try? AVAudioFile(forReading: url) else { return nil }
        let sampleRate = file.fileFormat.sampleRate
        guard sampleRate > 0 else { return nil }
        return Double(file.length) / sampleRate
    }
}

View File

@@ -0,0 +1,29 @@
# XcodeGen project spec for the ExampleiOSApp demo target.
name: ExampleiOSApp
options:
minimumXcodeGenVersion: 2.37.0
# Swift package providing the ONNX Runtime bindings.
packages:
onnxruntime:
url: https://github.com/microsoft/onnxruntime-swift-package-manager.git
from: 1.16.0
targets:
ExampleiOSApp:
type: application
platform: iOS
deploymentTarget: "15.0"
# App sources in this directory plus one helper file referenced from outside it.
sources:
- path: .
- path: ../../swift/Sources/Helper.swift
type: file
# NOTE(review): the app code searches the bundle for `voice_styles`, and the README
# setup step creates `onnx` and `voice_styles`; the `audio` entry below may be stale
# or may need a `voice_styles` sibling. TODO confirm.
resources:
- path: onnx
type: folder
- path: audio
type: folder
settings:
base:
PRODUCT_BUNDLE_IDENTIFIER: com.supertonic.ExampleiOSApp
SWIFT_VERSION: 5.9
INFOPLIST_FILE: Info.plist
dependencies:
- package: onnxruntime
product: onnxruntime

78
ios/README.md Normal file
View File

@@ -0,0 +1,78 @@
# Supertonic iOS Example App
A minimal iOS demo that runs Supertonic 2 (ONNX Runtime) on-device. The app shows:
- Multiline text input
- NFE (denoising steps) slider
- Voice toggle (M/F)
- Language selector (en, ko, es, pt, fr)
- Generate & Play buttons
- RTF display (Elapsed / Audio seconds)
All ONNX models/configs are reused from `Supertonic/assets/onnx`, and voice style JSON files from `Supertonic/assets/voice_styles`.
## 📰 Update News
**2026.01.06** - 🎉 **Supertonic 2** released with multilingual support! Now supports English (`en`), Korean (`ko`), Spanish (`es`), Portuguese (`pt`), and French (`fr`). [Demo](https://huggingface.co/spaces/Supertone/supertonic-2) | [Models](https://huggingface.co/Supertone/supertonic-2)
**2025.12.10** - Added [6 new voice styles](https://huggingface.co/Supertone/supertonic/tree/b10dbaf18b316159be75b34d24f740008fddd381) (M3, M4, M5, F3, F4, F5). See [Voices](https://supertone-inc.github.io/supertonic-py/voices/) for details
**2025.12.08** - Optimized ONNX models via [OnnxSlim](https://github.com/inisis/OnnxSlim) now available on [Hugging Face Models](https://huggingface.co/Supertone/supertonic)
## Prerequisites
- macOS 13+, Xcode 15+
- Swift 5.9+
- iOS 15+ device (recommended)
- Homebrew, XcodeGen
Install tools (if needed):
```bash
brew install xcodegen
```
## Quick Start (zero-click in Xcode)
0) Prepare assets next to the iOS target (one-time)
```bash
cd ios/ExampleiOSApp
mkdir -p onnx voice_styles
rsync -a ../../assets/onnx/ onnx/
rsync -a ../../assets/voice_styles/ voice_styles/
```
1) Generate the Xcode project with XcodeGen
```bash
xcodegen generate
open ExampleiOSApp.xcodeproj
```
2) In Xcode, select your iPhone as the run destination
- Targets → ExampleiOSApp → Signing & Capabilities: Select your Team
- iOS Deployment Target: 15.0+
3) Build & Run on device
- Type text → adjust NFE/Voice/Language → tap Generate → audio plays automatically
- An RTF line shows like: `RTF 0.30x · 3.04s / 10.11s`
## What's included (generated project)
- SwiftUI app files: `App.swift`, `ContentView.swift`, `TTSViewModel.swift`, `AudioPlayer.swift`
- Runtime wrapper: `TTSService.swift` (locates bundled assets and calls the shared TTS inference helpers from `swift/Sources/Helper.swift`)
- Resources (local, vendored in `ios/ExampleiOSApp/onnx` and `ios/ExampleiOSApp/voice_styles` after step 0)
These references are defined in `project.yml` and added to the app bundle by XcodeGen.
## App Controls
- **Text**: Multiline `TextEditor`
- **NFE**: Denoising steps (default 5)
- **Voice**: M/F voice style selector
- **Language**: Language selector (English, 한국어, Español, Português, Français)
- **Generate**: Runs end-to-end synthesis
- **Play/Stop**: Controls playback of the last output
- **RTF**: Shows Elapsed / Audio seconds for quick performance intuition
## Multilingual Support
Supertonic 2 supports multiple languages. Select the appropriate language for your input text:
- **English (en)**: Default language
- **한국어 (ko)**: Korean
- **Español (es)**: Spanish
- **Português (pt)**: Portuguese
- **Français (fr)**: French