This commit is contained in:
2025-09-27 12:00:17 +02:00
commit b564e41820
60 changed files with 7925 additions and 0 deletions

View File

@@ -0,0 +1,102 @@
# Spectral Synthesis Library
A library for synthesizing audio from images using additive synthesis and perceptual frequency mapping.
## Features
- **Image-to-Audio Synthesis**: Convert any image to audio by treating it as a spectrogram
- **Perceptual Accuracy**: Uses Mel-scale frequency mapping for better perceptual results
- **Spectral Peak Detection**: Only synthesizes significant frequency components
- **Temporal Smoothing**: Maintains coherent trajectories between time frames
- **Auto-Detection**: Automatically handles different image types (spectrograms vs diagrams)
## Quick Start
```typescript
import { synthesizeFromImage, downloadWAV } from './spectral-synthesis'
// Simple usage
const audioData = synthesizeFromImage(imageData)
// With custom parameters (note: a distinct binding — `audioData` is already declared above)
const customAudio = synthesizeFromImage(imageData, {
  duration: 10,
  minFreq: 100,
  maxFreq: 10000,
  maxPartials: 200
})
// Export as WAV
downloadWAV(audioData, 44100, 'my-audio.wav')
```
## API Reference
### Main Functions
#### `synthesizeFromImage(imageData, params?)`
- **imageData**: `ImageData` - Canvas image data
- **params**: `Partial<SynthesisParams>` - Optional parameters
- **Returns**: `Float32Array` - Audio samples
### Types
#### `SynthesisParams`
```typescript
interface SynthesisParams {
duration: number // Audio duration in seconds
minFreq: number // Minimum frequency in Hz
maxFreq: number // Maximum frequency in Hz
sampleRate: number // Sample rate in Hz
frequencyResolution: number // Frequency bin downsampling
timeResolution: number // Time slice downsampling
amplitudeThreshold: number // Minimum amplitude threshold
maxPartials: number // Maximum simultaneous partials
}
```
## Project Structure
```
spectral-synthesis/
├── core/
│ ├── types.ts # Type definitions
│ ├── utils.ts # Helper functions
│ └── synthesizer.ts # Main synthesis logic
├── audio/
│ └── export.ts # Audio export utilities
└── index.ts # Main exports
```
## Algorithm
1. **Image Analysis**: Auto-detect if colors should be inverted
2. **Frequency Mapping**: Convert image rows to Mel-scale frequencies
3. **Peak Detection**: Find significant spectral components
4. **Temporal Smoothing**: Apply continuity between time frames
5. **Perceptual Weighting**: Apply psychoacoustic amplitude scaling
6. **Additive Synthesis**: Generate and sum sine waves
## Usage Examples
### Basic Synthesis
```typescript
const canvas = document.createElement('canvas')
const ctx = canvas.getContext('2d')
// ... load image to canvas
const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
const audio = synthesizeFromImage(imageData)
```
### Advanced Usage
```typescript
import { ImageToAudioSynthesizer } from './spectral-synthesis'
const synthesizer = new ImageToAudioSynthesizer({
duration: 5,
maxPartials: 150
})
const result = synthesizer.synthesize(imageData)
console.log(`Generated ${result.duration}s of audio`)
```

View File

@@ -0,0 +1,78 @@
/**
* Create WAV buffer from audio data
*/
/**
 * Serialize mono float audio into a complete 16-bit PCM WAV file image.
 *
 * Layout: canonical 44-byte RIFF/WAVE header followed by little-endian
 * int16 samples. Input samples are clamped to [-1, 1] before scaling.
 *
 * @param audioData - Mono samples, nominally in the range [-1, 1]
 * @param sampleRate - Sample rate in Hz to record in the header
 * @returns ArrayBuffer containing the whole WAV file
 */
export function createWAVBuffer(audioData: Float32Array, sampleRate: number): ArrayBuffer {
  const sampleCount = audioData.length
  const dataBytes = sampleCount * 2 // 16-bit mono => 2 bytes per sample
  const buffer = new ArrayBuffer(44 + dataBytes)
  const view = new DataView(buffer)

  // RIFF container header
  writeString(view, 0, 'RIFF')
  view.setUint32(4, 36 + dataBytes, true) // remaining size = total - 8
  writeString(view, 8, 'WAVE')

  // "fmt " chunk: uncompressed linear PCM, mono, 16 bits per sample
  writeString(view, 12, 'fmt ')
  view.setUint32(16, 16, true) // fmt chunk payload size
  view.setUint16(20, 1, true) // audio format 1 = PCM
  view.setUint16(22, 1, true) // channel count (mono)
  view.setUint32(24, sampleRate, true)
  view.setUint32(28, sampleRate * 2, true) // byte rate = rate * block align
  view.setUint16(32, 2, true) // block align (bytes per frame)
  view.setUint16(34, 16, true) // bits per sample

  // "data" chunk carrying the PCM payload
  writeString(view, 36, 'data')
  view.setUint32(40, dataBytes, true)
  for (let i = 0; i < sampleCount; i++) {
    const clamped = Math.min(1, Math.max(-1, audioData[i]))
    view.setInt16(44 + i * 2, clamped * 0x7FFF, true)
  }
  return buffer
}

/** Write an ASCII tag into the view byte-by-byte at the given offset. */
function writeString(view: DataView, offset: number, string: string) {
  for (let i = 0; i < string.length; i++) {
    view.setUint8(offset + i, string.charCodeAt(i))
  }
}
/**
* Download audio as WAV file
*/
/**
 * Trigger a browser download of the audio as a WAV file.
 *
 * Encodes the samples, wraps them in a Blob, and clicks a temporary
 * anchor pointing at an object URL (revoked immediately afterwards).
 *
 * @param audioData - Mono samples in the range [-1, 1]
 * @param sampleRate - Sample rate in Hz
 * @param filename - Suggested file name for the download
 */
export function downloadWAV(audioData: Float32Array, sampleRate: number, filename: string) {
  const wavBytes = createWAVBuffer(audioData, sampleRate)
  const objectUrl = URL.createObjectURL(new Blob([wavBytes], { type: 'audio/wav' }))
  const anchor = document.createElement('a')
  anchor.href = objectUrl
  anchor.download = filename
  anchor.click()
  URL.revokeObjectURL(objectUrl)
}
/**
* Play audio in browser
*/
/**
 * Play the audio through the Web Audio API.
 *
 * Copies the samples into a mono AudioBuffer and starts a one-shot
 * buffer source wired to the default output. Warns (but still plays)
 * when the context's sample rate differs from the data's.
 *
 * @param audioData - Mono samples in the range [-1, 1]
 * @param sampleRate - Sample rate the data was generated at
 * @returns Promise that resolves when playback ends
 */
export async function playAudio(audioData: Float32Array, sampleRate: number): Promise<void> {
  const ContextCtor = window.AudioContext || (window as any).webkitAudioContext
  const context = new ContextCtor()
  if (context.sampleRate !== sampleRate) {
    console.warn(`Audio context sample rate (${context.sampleRate}) differs from data sample rate (${sampleRate})`)
  }
  const playbackBuffer = context.createBuffer(1, audioData.length, sampleRate)
  playbackBuffer.copyToChannel(audioData, 0)
  const source = context.createBufferSource()
  source.buffer = playbackBuffer
  source.connect(context.destination)
  const finished = new Promise<void>(resolve => {
    source.onended = () => resolve()
  })
  source.start()
  return finished
}

View File

@@ -0,0 +1,175 @@
import type { SynthesisParams, SpectralPeak, SynthesisResult } from './types'
import {
hzToMel,
melToHz,
detectSpectralPeaks,
perceptualAmplitudeWeighting,
shouldInvertImage,
extractSpectrum,
applyWindow
} from './utils'
/**
 * Additive synthesizer that interprets an image as a spectrogram:
 * each column is a time slice, each row a frequency bin (mapped to Hz
 * on the Mel scale, high frequencies at the top), and pixel brightness
 * the partial's amplitude.
 */
export class ImageToAudioSynthesizer {
  // Fully-resolved parameter set (defaults merged with caller overrides)
  private params: SynthesisParams

  /**
   * @param params - Partial overrides; any omitted field falls back to
   *   the defaults below (5 s, 20 Hz-20 kHz, 44.1 kHz, Hann window).
   */
  constructor(params: Partial<SynthesisParams> = {}) {
    this.params = {
      duration: 5,
      minFreq: 20,
      maxFreq: 20000,
      sampleRate: 44100,
      frequencyResolution: 1,
      timeResolution: 1,
      amplitudeThreshold: 0.01,
      maxPartials: 100,
      windowType: 'hann',
      ...params
    }
  }

  /**
   * Synthesize audio from image data.
   *
   * Per image column: extract a brightness spectrum, detect peaks, map
   * peak rows to frequencies on the Mel scale, smooth amplitudes
   * against the previous column, apply perceptual weighting, then sum
   * sine partials into a windowed chunk that is accumulated into the
   * output. The final signal is peak-normalized only if it would clip.
   *
   * @param imageData - Canvas ImageData treated as a spectrogram
   * @returns Audio samples plus the sample rate and duration used
   */
  synthesize(imageData: ImageData): SynthesisResult {
    // NOTE(review): 'data' is destructured but unused here; pixel reads
    // happen inside extractSpectrum.
    const { width, height, data } = imageData
    const {
      duration,
      minFreq,
      maxFreq,
      sampleRate,
      frequencyResolution,
      timeResolution,
      amplitudeThreshold,
      maxPartials,
      windowType
    } = this.params

    // Detect image type (bright background => invert so ink = energy)
    const invert = shouldInvertImage(imageData)

    // Calculate synthesis parameters
    const totalSamples = Math.floor(duration * sampleRate)
    const effectiveWidth = Math.floor(width / timeResolution)
    const effectiveHeight = Math.floor(height / frequencyResolution)
    const samplesPerColumn = totalSamples / effectiveWidth

    const audio = new Float32Array(totalSamples)

    // Pre-calculate mel-scale frequency mapping endpoints
    const minMel = hzToMel(minFreq)
    const maxMel = hzToMel(maxFreq)

    // Storage for temporal smoothing: amplitude per bin carried over
    // from the previous column (one-pole smoothing below)
    const previousAmplitudes = new Float32Array(effectiveHeight)
    const smoothingFactor = 0.3

    // Process each time slice
    for (let col = 0; col < effectiveWidth; col++) {
      const sourceCol = col * timeResolution
      const startSample = Math.floor(col * samplesPerColumn)
      const endSample = Math.floor((col + 1) * samplesPerColumn)

      // Extract spectrum for this time slice
      const spectrum = extractSpectrum(imageData, sourceCol, effectiveHeight, frequencyResolution, invert)

      // Detect spectral peaks.
      // NOTE(review): the threshold is capped at 0.01, so an
      // amplitudeThreshold above 0.01 has no effect here — confirm intent.
      const peaks = detectSpectralPeaks(spectrum, Math.min(amplitudeThreshold, 0.01))

      // Generate partials from peaks
      const partials: SpectralPeak[] = []
      for (const peakRow of peaks) {
        // Mel-scale frequency mapping (row 0 = top of image = maxFreq).
        // NOTE(review): divides by (effectiveHeight - 1); an image that
        // downsamples to a single row would divide by zero — confirm inputs.
        const melValue = maxMel - (peakRow / (effectiveHeight - 1)) * (maxMel - minMel)
        const frequency = melToHz(melValue)
        let amplitude = spectrum[peakRow]

        // Apply temporal smoothing against the previous column's value
        if (col > 0) {
          amplitude = smoothingFactor * previousAmplitudes[peakRow] + (1 - smoothingFactor) * amplitude
        }
        previousAmplitudes[peakRow] = amplitude

        // Apply perceptual weighting (gentle boost around 1 kHz)
        amplitude = perceptualAmplitudeWeighting(frequency, amplitude)

        // Use zero phase for simplicity; partials stay phase-coherent
        // across chunks because t below is absolute time, not chunk-local.
        const phase = 0

        // Second, tighter cull after smoothing and weighting
        if (amplitude > Math.min(amplitudeThreshold, 0.005)) {
          partials.push({ frequency, amplitude, phase })
        }
      }

      // Sort by amplitude (loudest first) and keep at most maxPartials
      partials.sort((a, b) => b.amplitude - a.amplitude)
      const limitedPartials = partials.slice(0, maxPartials)

      // Generate audio for this time slice by additive synthesis
      const chunkLength = endSample - startSample
      const audioChunk = new Float32Array(chunkLength)
      for (const { frequency, amplitude, phase } of limitedPartials) {
        for (let i = 0; i < chunkLength; i++) {
          const t = (startSample + i) / sampleRate // absolute time in seconds
          audioChunk[i] += amplitude * Math.sin(2 * Math.PI * frequency * t + phase)
        }
      }

      // Apply windowing to reduce boundary artifacts between chunks
      const windowedChunk = applyWindow(audioChunk, windowType)

      // Add windowed chunk to final audio (bounds-checked against rounding)
      for (let i = 0; i < chunkLength && startSample + i < totalSamples; i++) {
        audio[startSample + i] += windowedChunk[i]
      }
    }

    // Normalize to prevent clipping (only when the peak exceeds full scale)
    let maxAmplitude = 0
    for (let i = 0; i < audio.length; i++) {
      const absValue = Math.abs(audio[i])
      if (absValue > maxAmplitude) {
        maxAmplitude = absValue
      }
    }
    if (maxAmplitude > 1) {
      for (let i = 0; i < audio.length; i++) {
        audio[i] /= maxAmplitude
      }
    }

    return {
      audio,
      sampleRate,
      duration
    }
  }

  /**
   * Merge new parameter overrides into the current set.
   */
  updateParams(newParams: Partial<SynthesisParams>): void {
    this.params = { ...this.params, ...newParams }
  }

  /**
   * Get a defensive copy of the current parameters.
   */
  getParams(): SynthesisParams {
    return { ...this.params }
  }
}
/**
* Convenience function for quick synthesis
*/
/**
 * One-shot convenience wrapper: construct a synthesizer, run it, and
 * return just the audio samples.
 *
 * @param imageData - Canvas image data to sonify
 * @param params - Optional synthesis parameter overrides
 * @returns Synthesized audio samples
 */
export function synthesizeFromImage(
  imageData: ImageData,
  params: Partial<SynthesisParams> = {}
): Float32Array {
  return new ImageToAudioSynthesizer(params).synthesize(imageData).audio
}

View File

@@ -0,0 +1,25 @@
/** Window shape applied to each synthesized time-slice chunk. */
export type WindowType = 'rectangular' | 'hann' | 'hamming' | 'blackman'

/** Tunable parameters for image-to-audio synthesis. */
export interface SynthesisParams {
  duration: number // output length in seconds
  minFreq: number // lowest synthesized frequency in Hz
  maxFreq: number // highest synthesized frequency in Hz
  sampleRate: number // output sample rate in Hz
  frequencyResolution: number // row stride when sampling the image (frequency downsampling)
  timeResolution: number // column stride when sampling the image (time downsampling)
  amplitudeThreshold: number // minimum amplitude for a partial to be kept
  maxPartials: number // cap on simultaneous partials per time slice
  windowType: WindowType // window applied to each synthesized chunk
}

/** A single sinusoidal component to synthesize. */
export interface SpectralPeak {
  frequency: number // Hz
  amplitude: number // linear amplitude
  phase: number // radians
}

/** Output of ImageToAudioSynthesizer.synthesize. */
export interface SynthesisResult {
  audio: Float32Array // mono samples
  sampleRate: number // Hz
  duration: number // seconds
}

View File

@@ -0,0 +1,143 @@
/**
* Convert frequency from Hz to Mel scale
*/
/**
 * Convert a frequency in Hz to the Mel scale (O'Shaughnessy formula).
 *
 * @param freq - Frequency in Hz
 * @returns Corresponding Mel value
 */
export function hzToMel(freq: number): number {
  const ratio = 1 + freq / 700
  return 2595 * Math.log10(ratio)
}
/**
* Convert frequency from Mel scale to Hz
*/
/**
 * Convert a Mel-scale value back to a frequency in Hz (inverse of hzToMel).
 *
 * @param mel - Mel value
 * @returns Frequency in Hz
 */
export function melToHz(mel: number): number {
  return 700 * (Math.pow(10, mel / 2595) - 1)
}
/**
* Detect spectral peaks in amplitude spectrum
*/
/**
 * Find indices of significant components in an amplitude spectrum.
 *
 * Primary pass: every bin strictly above `threshold`. If none qualify,
 * falls back to strict local maxima above a fixed floor of 0.001 so a
 * faint image still yields some output.
 *
 * @param spectrum - Amplitude per frequency bin
 * @param threshold - Primary amplitude cutoff (default 0.01)
 * @returns Bin indices of detected peaks, in ascending order
 */
export function detectSpectralPeaks(spectrum: number[], threshold: number = 0.01): number[] {
  const aboveThreshold: number[] = []
  spectrum.forEach((value, index) => {
    if (value > threshold) aboveThreshold.push(index)
  })
  if (aboveThreshold.length > 0) return aboveThreshold

  // Fallback: interior bins that strictly exceed both neighbors and the floor
  const localMaxima: number[] = []
  for (let i = 1; i < spectrum.length - 1; i++) {
    const isPeak = spectrum[i] > spectrum[i - 1] && spectrum[i] > spectrum[i + 1]
    if (isPeak && spectrum[i] > 0.001) localMaxima.push(i)
  }
  return localMaxima
}
/**
* Apply perceptual amplitude weighting
*/
/**
 * Scale an amplitude by a gentle perceptual weight.
 *
 * The weight is a Gaussian bump in log-frequency centered at 1 kHz,
 * ranging from about 0.5 far from the center up to exactly 1.0 at 1 kHz.
 *
 * @param freq - Partial frequency in Hz
 * @param amplitude - Raw amplitude
 * @returns Weighted amplitude
 */
export function perceptualAmplitudeWeighting(freq: number, amplitude: number): number {
  const logOffset = Math.log10(freq / 1000) // 0 at exactly 1 kHz
  const gaussian = Math.exp(-logOffset * logOffset * 0.5)
  return amplitude * (gaussian * 0.5 + 0.5)
}
/**
* Auto-detect if image colors should be inverted
*/
/**
 * Decide whether the image's colors should be inverted before synthesis.
 *
 * Computes the mean luma (Rec. 601 weights) over all pixels; a bright
 * mean (> 0.5) suggests a dark-on-light diagram rather than a
 * spectrogram, so brightness should be flipped by the caller.
 *
 * @param imageData - Canvas image data (RGBA)
 * @returns true when the image is predominantly bright
 */
export function shouldInvertImage(imageData: ImageData): boolean {
  const { width, height, data } = imageData
  let brightnessSum = 0
  for (let offset = 0; offset < data.length; offset += 4) {
    const luma = 0.299 * data[offset] + 0.587 * data[offset + 1] + 0.114 * data[offset + 2]
    brightnessSum += luma / 255
  }
  return brightnessSum / (width * height) > 0.5 // Bright background = diagram
}
/**
* Generate windowing function
*/
/**
 * Generate an amplitude windowing function of the given length.
 *
 * Supported types: 'hann', 'hamming', 'blackman'; anything else
 * (including 'rectangular') yields an all-ones window.
 *
 * @param length - Number of samples in the window
 * @param windowType - Window shape name
 * @returns Float32Array of per-sample gains in [0, 1]
 */
export function generateWindow(length: number, windowType: string): Float32Array {
  const window = new Float32Array(length)
  // Bug fix: the periodic terms below divide by (length - 1). For a
  // 0- or 1-sample window that is 0/0 = NaN, which would silently
  // poison the audio chunk it is applied to. A degenerate window is
  // simply unity gain.
  if (length <= 1) {
    window.fill(1.0)
    return window
  }
  switch (windowType) {
    case 'hann':
      for (let i = 0; i < length; i++) {
        window[i] = 0.5 * (1 - Math.cos(2 * Math.PI * i / (length - 1)))
      }
      break
    case 'hamming':
      for (let i = 0; i < length; i++) {
        window[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1))
      }
      break
    case 'blackman':
      for (let i = 0; i < length; i++) {
        const factor = 2 * Math.PI * i / (length - 1)
        window[i] = 0.42 - 0.5 * Math.cos(factor) + 0.08 * Math.cos(2 * factor)
      }
      break
    case 'rectangular':
    default:
      window.fill(1.0)
      break
  }
  return window
}
/**
* Apply windowing function to audio chunk
*/
/**
 * Multiply an audio chunk element-wise by a window of the same length.
 *
 * @param audioChunk - Samples to shape (not mutated)
 * @param windowType - Window name forwarded to generateWindow
 * @returns New Float32Array with the window applied
 */
export function applyWindow(audioChunk: Float32Array, windowType: string): Float32Array {
  const gains = generateWindow(audioChunk.length, windowType)
  return audioChunk.map((sample, i) => sample * gains[i])
}
/**
* Extract grayscale spectrum from image column
*/
/**
 * Read one image column as a grayscale amplitude spectrum.
 *
 * Bin 0 of the result corresponds to image row 0 (top). Rows are
 * sampled with a stride of `frequencyResolution`; brightness uses
 * Rec. 601 luma weights normalized to [0, 1], optionally inverted.
 *
 * @param imageData - Canvas image data (RGBA)
 * @param col - Source column index
 * @param height - Number of spectrum bins to produce
 * @param frequencyResolution - Row stride when sampling the image
 * @param invert - Whether to flip brightness (1 - value)
 * @returns One amplitude per bin
 */
export function extractSpectrum(
  imageData: ImageData,
  col: number,
  height: number,
  frequencyResolution: number,
  invert: boolean
): number[] {
  const { width, data } = imageData
  const spectrum: number[] = []
  for (let bin = 0; bin < height; bin++) {
    const pixelBase = (bin * frequencyResolution * width + col) * 4
    const r = data[pixelBase]
    const g = data[pixelBase + 1]
    const b = data[pixelBase + 2]
    const normalized = (0.299 * r + 0.587 * g + 0.114 * b) / 255
    spectrum.push(invert ? 1 - normalized : normalized)
  }
  return spectrum
}

View File

@@ -0,0 +1,22 @@
// Public entry points for the spectral-synthesis library.

// Core synthesis: the synthesizer class and its one-call convenience wrapper
export { ImageToAudioSynthesizer, synthesizeFromImage } from './core/synthesizer'
export type { SynthesisParams, SpectralPeak, SynthesisResult, WindowType } from './core/types'

// Low-level utilities: Mel conversion, peak detection, perceptual
// weighting, image sampling, and windowing
export {
  hzToMel,
  melToHz,
  detectSpectralPeaks,
  perceptualAmplitudeWeighting,
  shouldInvertImage,
  extractSpectrum,
  generateWindow,
  applyWindow
} from './core/utils'

// Audio export / playback: WAV encoding, browser download, Web Audio playback
export {
  createWAVBuffer,
  downloadWAV,
  playAudio
} from './audio/export'