Init
This commit is contained in:
102
src/spectral-synthesis/README.md
Normal file
102
src/spectral-synthesis/README.md
Normal file
@@ -0,0 +1,102 @@
|
||||
# Spectral Synthesis Library
|
||||
|
||||
A library for synthesizing audio from images using additive synthesis and perceptual frequency mapping.
|
||||
|
||||
## Features
|
||||
|
||||
- **Image-to-Audio Synthesis**: Convert any image to audio by treating it as a spectrogram
|
||||
- **Perceptual Accuracy**: Uses Mel-scale frequency mapping for better perceptual results
|
||||
- **Spectral Peak Detection**: Only synthesizes significant frequency components
|
||||
- **Temporal Smoothing**: Maintains coherent trajectories between time frames
|
||||
- **Auto-Detection**: Automatically handles different image types (spectrograms vs diagrams)
|
||||
|
||||
## Quick Start
|
||||
|
||||
```typescript
|
||||
import { synthesizeFromImage, downloadWAV } from './spectral-synthesis'
|
||||
|
||||
// Simple usage
|
||||
const audioData = synthesizeFromImage(imageData)
|
||||
|
||||
// With custom parameters
|
||||
const customAudio = synthesizeFromImage(imageData, {
|
||||
duration: 10,
|
||||
minFreq: 100,
|
||||
maxFreq: 10000,
|
||||
maxPartials: 200
|
||||
})
|
||||
|
||||
// Export as WAV
|
||||
downloadWAV(audioData, 44100, 'my-audio.wav')
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
### Main Functions
|
||||
|
||||
#### `synthesizeFromImage(imageData, params?)`
|
||||
- **imageData**: `ImageData` - Canvas image data
|
||||
- **params**: `Partial<SynthesisParams>` - Optional parameters
|
||||
- **Returns**: `Float32Array` - Audio samples
|
||||
|
||||
### Types
|
||||
|
||||
#### `SynthesisParams`
|
||||
```typescript
|
||||
interface SynthesisParams {
|
||||
duration: number // Audio duration in seconds
|
||||
minFreq: number // Minimum frequency in Hz
|
||||
maxFreq: number // Maximum frequency in Hz
|
||||
sampleRate: number // Sample rate in Hz
|
||||
frequencyResolution: number // Frequency bin downsampling
|
||||
timeResolution: number // Time slice downsampling
|
||||
amplitudeThreshold: number // Minimum amplitude threshold
|
||||
maxPartials: number // Maximum simultaneous partials
windowType: WindowType // Window function: 'rectangular' | 'hann' | 'hamming' | 'blackman'
|
||||
}
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
spectral-synthesis/
|
||||
├── core/
|
||||
│ ├── types.ts # Type definitions
|
||||
│ ├── utils.ts # Helper functions
|
||||
│ └── synthesizer.ts # Main synthesis logic
|
||||
├── audio/
|
||||
│ └── export.ts # Audio export utilities
|
||||
└── index.ts # Main exports
|
||||
```
|
||||
|
||||
## Algorithm
|
||||
|
||||
1. **Image Analysis**: Auto-detect if colors should be inverted
|
||||
2. **Frequency Mapping**: Convert image rows to Mel-scale frequencies
|
||||
3. **Peak Detection**: Find significant spectral components
|
||||
4. **Temporal Smoothing**: Apply continuity between time frames
|
||||
5. **Perceptual Weighting**: Apply psychoacoustic amplitude scaling
|
||||
6. **Additive Synthesis**: Generate and sum sine waves
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Synthesis
|
||||
```typescript
|
||||
const canvas = document.createElement('canvas')
|
||||
const ctx = canvas.getContext('2d')
|
||||
// ... load image to canvas
|
||||
const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
|
||||
const audio = synthesizeFromImage(imageData)
|
||||
```
|
||||
|
||||
### Advanced Usage
|
||||
```typescript
|
||||
import { ImageToAudioSynthesizer } from './spectral-synthesis'
|
||||
|
||||
const synthesizer = new ImageToAudioSynthesizer({
|
||||
duration: 5,
|
||||
maxPartials: 150
|
||||
})
|
||||
|
||||
const result = synthesizer.synthesize(imageData)
|
||||
console.log(`Generated ${result.duration}s of audio`)
|
||||
```
|
||||
78
src/spectral-synthesis/audio/export.ts
Normal file
78
src/spectral-synthesis/audio/export.ts
Normal file
@@ -0,0 +1,78 @@
|
||||
/**
|
||||
* Create WAV buffer from audio data
|
||||
*/
|
||||
export function createWAVBuffer(audioData: Float32Array, sampleRate: number): ArrayBuffer {
|
||||
const length = audioData.length
|
||||
const buffer = new ArrayBuffer(44 + length * 2)
|
||||
const view = new DataView(buffer)
|
||||
|
||||
// WAV header
|
||||
writeString(view, 0, 'RIFF')
|
||||
view.setUint32(4, 36 + length * 2, true) // file length - 8
|
||||
writeString(view, 8, 'WAVE')
|
||||
writeString(view, 12, 'fmt ')
|
||||
view.setUint32(16, 16, true) // format chunk length
|
||||
view.setUint16(20, 1, true) // PCM format
|
||||
view.setUint16(22, 1, true) // mono
|
||||
view.setUint32(24, sampleRate, true)
|
||||
view.setUint32(28, sampleRate * 2, true) // byte rate
|
||||
view.setUint16(32, 2, true) // block align
|
||||
view.setUint16(34, 16, true) // bits per sample
|
||||
writeString(view, 36, 'data')
|
||||
view.setUint32(40, length * 2, true) // data chunk length
|
||||
|
||||
// Convert float samples to 16-bit PCM
|
||||
let offset = 44
|
||||
for (let i = 0; i < length; i++) {
|
||||
const sample = Math.max(-1, Math.min(1, audioData[i]))
|
||||
view.setInt16(offset, sample * 0x7FFF, true)
|
||||
offset += 2
|
||||
}
|
||||
|
||||
return buffer
|
||||
}
|
||||
|
||||
function writeString(view: DataView, offset: number, string: string) {
|
||||
for (let i = 0; i < string.length; i++) {
|
||||
view.setUint8(offset + i, string.charCodeAt(i))
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Download audio as WAV file
|
||||
*/
|
||||
export function downloadWAV(audioData: Float32Array, sampleRate: number, filename: string) {
|
||||
const buffer = createWAVBuffer(audioData, sampleRate)
|
||||
const blob = new Blob([buffer], { type: 'audio/wav' })
|
||||
const url = URL.createObjectURL(blob)
|
||||
|
||||
const a = document.createElement('a')
|
||||
a.href = url
|
||||
a.download = filename
|
||||
a.click()
|
||||
|
||||
URL.revokeObjectURL(url)
|
||||
}
|
||||
|
||||
/**
|
||||
* Play audio in browser
|
||||
*/
|
||||
export async function playAudio(audioData: Float32Array, sampleRate: number): Promise<void> {
|
||||
const audioContext = new (window.AudioContext || (window as any).webkitAudioContext)()
|
||||
|
||||
if (audioContext.sampleRate !== sampleRate) {
|
||||
console.warn(`Audio context sample rate (${audioContext.sampleRate}) differs from data sample rate (${sampleRate})`)
|
||||
}
|
||||
|
||||
const buffer = audioContext.createBuffer(1, audioData.length, sampleRate)
|
||||
buffer.copyToChannel(audioData, 0)
|
||||
|
||||
const source = audioContext.createBufferSource()
|
||||
source.buffer = buffer
|
||||
source.connect(audioContext.destination)
|
||||
source.start()
|
||||
|
||||
return new Promise(resolve => {
|
||||
source.onended = () => resolve()
|
||||
})
|
||||
}
|
||||
175
src/spectral-synthesis/core/synthesizer.ts
Normal file
175
src/spectral-synthesis/core/synthesizer.ts
Normal file
@@ -0,0 +1,175 @@
|
||||
import type { SynthesisParams, SpectralPeak, SynthesisResult } from './types'
|
||||
import {
|
||||
hzToMel,
|
||||
melToHz,
|
||||
detectSpectralPeaks,
|
||||
perceptualAmplitudeWeighting,
|
||||
shouldInvertImage,
|
||||
extractSpectrum,
|
||||
applyWindow
|
||||
} from './utils'
|
||||
|
||||
export class ImageToAudioSynthesizer {
|
||||
private params: SynthesisParams
|
||||
|
||||
constructor(params: Partial<SynthesisParams> = {}) {
|
||||
this.params = {
|
||||
duration: 5,
|
||||
minFreq: 20,
|
||||
maxFreq: 20000,
|
||||
sampleRate: 44100,
|
||||
frequencyResolution: 1,
|
||||
timeResolution: 1,
|
||||
amplitudeThreshold: 0.01,
|
||||
maxPartials: 100,
|
||||
windowType: 'hann',
|
||||
...params
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Synthesize audio from image data
|
||||
*/
|
||||
synthesize(imageData: ImageData): SynthesisResult {
|
||||
const { width, height, data } = imageData
|
||||
const {
|
||||
duration,
|
||||
minFreq,
|
||||
maxFreq,
|
||||
sampleRate,
|
||||
frequencyResolution,
|
||||
timeResolution,
|
||||
amplitudeThreshold,
|
||||
maxPartials,
|
||||
windowType
|
||||
} = this.params
|
||||
|
||||
// Detect image type
|
||||
const invert = shouldInvertImage(imageData)
|
||||
|
||||
// Calculate synthesis parameters
|
||||
const totalSamples = Math.floor(duration * sampleRate)
|
||||
const effectiveWidth = Math.floor(width / timeResolution)
|
||||
const effectiveHeight = Math.floor(height / frequencyResolution)
|
||||
const samplesPerColumn = totalSamples / effectiveWidth
|
||||
const audio = new Float32Array(totalSamples)
|
||||
|
||||
// Pre-calculate mel-scale frequency mapping
|
||||
const minMel = hzToMel(minFreq)
|
||||
const maxMel = hzToMel(maxFreq)
|
||||
|
||||
// Storage for temporal smoothing
|
||||
const previousAmplitudes = new Float32Array(effectiveHeight)
|
||||
const smoothingFactor = 0.3
|
||||
|
||||
// Process each time slice
|
||||
for (let col = 0; col < effectiveWidth; col++) {
|
||||
const sourceCol = col * timeResolution
|
||||
const startSample = Math.floor(col * samplesPerColumn)
|
||||
const endSample = Math.floor((col + 1) * samplesPerColumn)
|
||||
|
||||
// Extract spectrum for this time slice
|
||||
const spectrum = extractSpectrum(imageData, sourceCol, effectiveHeight, frequencyResolution, invert)
|
||||
|
||||
// Detect spectral peaks
|
||||
const peaks = detectSpectralPeaks(spectrum, Math.min(amplitudeThreshold, 0.01))
|
||||
|
||||
// Generate partials from peaks
|
||||
const partials: SpectralPeak[] = []
|
||||
|
||||
for (const peakRow of peaks) {
|
||||
// Mel-scale frequency mapping (high freq at top)
|
||||
const melValue = maxMel - (peakRow / (effectiveHeight - 1)) * (maxMel - minMel)
|
||||
const frequency = melToHz(melValue)
|
||||
|
||||
let amplitude = spectrum[peakRow]
|
||||
|
||||
// Apply temporal smoothing
|
||||
if (col > 0) {
|
||||
amplitude = smoothingFactor * previousAmplitudes[peakRow] + (1 - smoothingFactor) * amplitude
|
||||
}
|
||||
previousAmplitudes[peakRow] = amplitude
|
||||
|
||||
// Apply perceptual weighting
|
||||
amplitude = perceptualAmplitudeWeighting(frequency, amplitude)
|
||||
|
||||
// Use zero phase for simplicity
|
||||
const phase = 0
|
||||
|
||||
if (amplitude > Math.min(amplitudeThreshold, 0.005)) {
|
||||
partials.push({ frequency, amplitude, phase })
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by amplitude and limit partials
|
||||
partials.sort((a, b) => b.amplitude - a.amplitude)
|
||||
const limitedPartials = partials.slice(0, maxPartials)
|
||||
|
||||
// Generate audio for this time slice
|
||||
const chunkLength = endSample - startSample
|
||||
const audioChunk = new Float32Array(chunkLength)
|
||||
|
||||
for (const { frequency, amplitude, phase } of limitedPartials) {
|
||||
for (let i = 0; i < chunkLength; i++) {
|
||||
const t = (startSample + i) / sampleRate
|
||||
audioChunk[i] += amplitude * Math.sin(2 * Math.PI * frequency * t + phase)
|
||||
}
|
||||
}
|
||||
|
||||
// Apply windowing to reduce artifacts
|
||||
const windowedChunk = applyWindow(audioChunk, windowType)
|
||||
|
||||
// Add windowed chunk to final audio
|
||||
for (let i = 0; i < chunkLength && startSample + i < totalSamples; i++) {
|
||||
audio[startSample + i] += windowedChunk[i]
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize to prevent clipping
|
||||
let maxAmplitude = 0
|
||||
for (let i = 0; i < audio.length; i++) {
|
||||
const absValue = Math.abs(audio[i])
|
||||
if (absValue > maxAmplitude) {
|
||||
maxAmplitude = absValue
|
||||
}
|
||||
}
|
||||
|
||||
if (maxAmplitude > 1) {
|
||||
for (let i = 0; i < audio.length; i++) {
|
||||
audio[i] /= maxAmplitude
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
audio,
|
||||
sampleRate,
|
||||
duration
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update synthesis parameters
|
||||
*/
|
||||
updateParams(newParams: Partial<SynthesisParams>): void {
|
||||
this.params = { ...this.params, ...newParams }
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current parameters
|
||||
*/
|
||||
getParams(): SynthesisParams {
|
||||
return { ...this.params }
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function for quick synthesis
|
||||
*/
|
||||
export function synthesizeFromImage(
|
||||
imageData: ImageData,
|
||||
params: Partial<SynthesisParams> = {}
|
||||
): Float32Array {
|
||||
const synthesizer = new ImageToAudioSynthesizer(params)
|
||||
const result = synthesizer.synthesize(imageData)
|
||||
return result.audio
|
||||
}
|
||||
25
src/spectral-synthesis/core/types.ts
Normal file
25
src/spectral-synthesis/core/types.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
/** Supported amplitude-window shapes for per-slice smoothing. */
export type WindowType = 'rectangular' | 'hann' | 'hamming' | 'blackman'

/** Tunable settings controlling image-to-audio synthesis. */
export interface SynthesisParams {
  /** Output length in seconds. */
  duration: number
  /** Lowest synthesized frequency in Hz. */
  minFreq: number
  /** Highest synthesized frequency in Hz. */
  maxFreq: number
  /** Output sample rate in Hz. */
  sampleRate: number
  /** Downsampling stride applied to image rows (frequency axis). */
  frequencyResolution: number
  /** Downsampling stride applied to image columns (time axis). */
  timeResolution: number
  /** Minimum amplitude for a component to be synthesized. */
  amplitudeThreshold: number
  /** Maximum number of simultaneous sine partials per time slice. */
  maxPartials: number
  /** Window function applied to each synthesized chunk. */
  windowType: WindowType
}

/** A single sinusoidal component to synthesize. */
export interface SpectralPeak {
  /** Frequency in Hz. */
  frequency: number
  /** Linear amplitude. */
  amplitude: number
  /** Starting phase in radians. */
  phase: number
}

/** Output of a synthesis run. */
export interface SynthesisResult {
  /** Mono audio samples. */
  audio: Float32Array
  /** Sample rate of `audio` in Hz. */
  sampleRate: number
  /** Requested duration in seconds. */
  duration: number
}
|
||||
143
src/spectral-synthesis/core/utils.ts
Normal file
143
src/spectral-synthesis/core/utils.ts
Normal file
@@ -0,0 +1,143 @@
|
||||
/**
|
||||
* Convert frequency from Hz to Mel scale
|
||||
*/
|
||||
export function hzToMel(freq: number): number {
|
||||
return 2595 * Math.log10(1 + freq / 700)
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert frequency from Mel scale to Hz
|
||||
*/
|
||||
export function melToHz(mel: number): number {
|
||||
return 700 * (Math.pow(10, mel / 2595) - 1)
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect spectral peaks in amplitude spectrum
|
||||
*/
|
||||
export function detectSpectralPeaks(spectrum: number[], threshold: number = 0.01): number[] {
|
||||
const peaks: number[] = []
|
||||
|
||||
// Find significant components above threshold
|
||||
for (let i = 0; i < spectrum.length; i++) {
|
||||
if (spectrum[i] > threshold) {
|
||||
peaks.push(i)
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: use local maxima with lower threshold if no peaks found
|
||||
if (peaks.length === 0) {
|
||||
for (let i = 1; i < spectrum.length - 1; i++) {
|
||||
if (spectrum[i] > spectrum[i - 1] &&
|
||||
spectrum[i] > spectrum[i + 1] &&
|
||||
spectrum[i] > 0.001) {
|
||||
peaks.push(i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return peaks
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply perceptual amplitude weighting
|
||||
*/
|
||||
export function perceptualAmplitudeWeighting(freq: number, amplitude: number): number {
|
||||
// Gentle boost around 1kHz for perceptual accuracy
|
||||
const normalizedFreq = Math.log10(freq / 1000)
|
||||
const weight = Math.exp(-normalizedFreq * normalizedFreq * 0.5) * 0.5 + 0.5
|
||||
return amplitude * weight
|
||||
}
|
||||
|
||||
/**
|
||||
* Auto-detect if image colors should be inverted
|
||||
*/
|
||||
export function shouldInvertImage(imageData: ImageData): boolean {
|
||||
const { width, height, data } = imageData
|
||||
let totalBrightness = 0
|
||||
|
||||
for (let i = 0; i < data.length; i += 4) {
|
||||
const gray = 0.299 * data[i] + 0.587 * data[i + 1] + 0.114 * data[i + 2]
|
||||
totalBrightness += gray / 255
|
||||
}
|
||||
|
||||
const meanBrightness = totalBrightness / (width * height)
|
||||
return meanBrightness > 0.5 // Bright background = diagram
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate windowing function
|
||||
*/
|
||||
export function generateWindow(length: number, windowType: string): Float32Array {
|
||||
const window = new Float32Array(length)
|
||||
|
||||
switch (windowType) {
|
||||
case 'hann':
|
||||
for (let i = 0; i < length; i++) {
|
||||
window[i] = 0.5 * (1 - Math.cos(2 * Math.PI * i / (length - 1)))
|
||||
}
|
||||
break
|
||||
|
||||
case 'hamming':
|
||||
for (let i = 0; i < length; i++) {
|
||||
window[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1))
|
||||
}
|
||||
break
|
||||
|
||||
case 'blackman':
|
||||
for (let i = 0; i < length; i++) {
|
||||
const factor = 2 * Math.PI * i / (length - 1)
|
||||
window[i] = 0.42 - 0.5 * Math.cos(factor) + 0.08 * Math.cos(2 * factor)
|
||||
}
|
||||
break
|
||||
|
||||
case 'rectangular':
|
||||
default:
|
||||
window.fill(1.0)
|
||||
break
|
||||
}
|
||||
|
||||
return window
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply windowing function to audio chunk
|
||||
*/
|
||||
export function applyWindow(audioChunk: Float32Array, windowType: string): Float32Array {
|
||||
const window = generateWindow(audioChunk.length, windowType)
|
||||
const windowed = new Float32Array(audioChunk.length)
|
||||
|
||||
for (let i = 0; i < audioChunk.length; i++) {
|
||||
windowed[i] = audioChunk[i] * window[i]
|
||||
}
|
||||
|
||||
return windowed
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract grayscale spectrum from image column
|
||||
*/
|
||||
export function extractSpectrum(
|
||||
imageData: ImageData,
|
||||
col: number,
|
||||
height: number,
|
||||
frequencyResolution: number,
|
||||
invert: boolean
|
||||
): number[] {
|
||||
const { width, data } = imageData
|
||||
const spectrum: number[] = []
|
||||
|
||||
for (let row = 0; row < height; row++) {
|
||||
const sourceRow = row * frequencyResolution
|
||||
const idx = (sourceRow * width + col) * 4
|
||||
const r = data[idx]
|
||||
const g = data[idx + 1]
|
||||
const b = data[idx + 2]
|
||||
|
||||
let amplitude = (0.299 * r + 0.587 * g + 0.114 * b) / 255
|
||||
if (invert) amplitude = 1 - amplitude
|
||||
spectrum.push(amplitude)
|
||||
}
|
||||
|
||||
return spectrum
|
||||
}
|
||||
22
src/spectral-synthesis/index.ts
Normal file
22
src/spectral-synthesis/index.ts
Normal file
@@ -0,0 +1,22 @@
|
||||
// Public entry point for the spectral-synthesis library.

// Core synthesis: the synthesizer class, its one-shot wrapper, and the types
export { ImageToAudioSynthesizer, synthesizeFromImage } from './core/synthesizer'
export type { SynthesisParams, SpectralPeak, SynthesisResult, WindowType } from './core/types'

// Utilities: Mel mapping, peak detection, windowing, image sampling
export {
  hzToMel,
  melToHz,
  detectSpectralPeaks,
  perceptualAmplitudeWeighting,
  shouldInvertImage,
  extractSpectrum,
  generateWindow,
  applyWindow
} from './core/utils'

// Audio export: WAV encoding/download and browser playback
export {
  createWAVBuffer,
  downloadWAV,
  playAudio
} from './audio/export'
|
||||
Reference in New Issue
Block a user