ai iteration

This commit is contained in:
2025-08-08 18:27:10 +02:00
parent 512521e9d0
commit 1d51176311
9 changed files with 2880 additions and 49 deletions

2
.gitignore vendored
View File

@@ -4,3 +4,5 @@ node_modules/
.env
stories/sample_story/
dist/

2687
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -2,7 +2,6 @@
"name": "project-noctivus",
"version": "1.0.0",
"description": "An orchestrator for creating audiobooks from text.",
"type": "module",
"main": "dist/orchestrator.js",
"scripts": {
"start": "ts-node src/orchestrator.ts",

View File

@@ -1,6 +1,7 @@
import { spawn } from "child_process";
import { StoryConfig } from "./config";
import * as path from "path";
import * as fs from "fs";
const ffmpeg = require("ffmpeg-static");
const ffprobe = require("ffprobe-static");
@@ -58,10 +59,13 @@ export async function generateSilence(duration: number, outputFile: string): Pro
export async function combineAudio(storyName: string, storyConfig: StoryConfig, audioFiles: string[]): Promise<void> {
const introFile = path.resolve("stories", storyName, storyConfig.config.intro_audio_file);
const outroFile = path.resolve("stories", storyName, storyConfig.config.outro_audio_file);
const tempAudioFile = path.resolve("stories", storyName, "final_audio", "temp.mp3");
const finalAudioFile = path.resolve("stories", storyName, "final_audio", "final.mp3");
const finalAudioDir = path.resolve("stories", storyName, "final_audio");
const tempAudioFile = path.join(finalAudioDir, "temp.mp3");
const finalAudioFile = path.join(finalAudioDir, "final.mp3");
const backgroundMusicFile = path.resolve("stories", storyName, storyConfig.config.background_music_file);
fs.mkdirSync(finalAudioDir, { recursive: true });
// First, concatenate the main audio files
const allFiles = [introFile, ...audioFiles.map((f) => path.resolve(f)), outroFile];
const fileList = allFiles.map((f) => `file '${f.replace(/'/g, "'\\''")}'`).join("\n");
@@ -85,18 +89,28 @@ export async function combineAudio(storyName: string, storyConfig: StoryConfig,
// Then, get the duration of the concatenated audio
const duration = await getDuration(tempAudioFile);
// Generate silence for the background track
await generateSilence(duration, backgroundMusicFile);
if (!fs.existsSync(backgroundMusicFile)) {
// If background music is missing, just copy the narration
await new Promise<void>((resolve, reject) => {
const args = ["-y", "-i", tempAudioFile, "-c:a", "libmp3lame", "-q:a", "4", finalAudioFile];
const p = spawn(ffmpeg, args);
p.on("close", (code: any) => (code === 0 ? resolve() : reject(new Error(`ffmpeg copy failed ${code}`))));
});
return;
}
// Finally, mix the main audio with the background music
// Finally, mix the main audio with the looped/trimmed background music at a lower volume
const bgVolume = "0.2";
const mixArgs = [
"-y",
"-i",
tempAudioFile,
"-stream_loop",
"-1",
"-i",
backgroundMusicFile,
"-filter_complex",
"[0:a][1:a]amerge=inputs=2[a]",
`[1:a]volume=${bgVolume},atrim=0:${duration},asetpts=N/SR/TB[bg];[0:a][bg]amix=inputs=2:duration=first:dropout_transition=0[a]`,
"-map",
"[a]",
"-c:a",

View File

@@ -1,6 +1,7 @@
import * as yaml from "js-yaml";
import * as fs from "fs";
import * as path from "path";
import { z } from "zod";
export interface StoryConfig {
metadata: {
@@ -13,18 +14,43 @@ export interface StoryConfig {
config: {
chunk_size: number;
tts_voice_id: string;
tts_instructions: string;
image_style_prompts: string;
tts_instructions?: string;
image_style_prompts?: string;
intro_audio_file: string;
outro_audio_file: string;
background_music_file: string;
export_settings: {
format: string;
format?: string;
resolution: string;
};
};
}
// Runtime validation for config.yaml, mirroring the StoryConfig interface.
// NOTE(review): `z` is the zod import at the top of this file.

// Shape of the `export_settings` sub-object; a missing object or missing
// fields fall back to an mp4 container at a square 1024x1024 resolution.
const ExportSettingsSchema = z
  .object({
    format: z.string().optional().default("mp4"),
    resolution: z.string().regex(/^\d+x\d+$/).default("1024x1024"),
  })
  .default({ format: "mp4", resolution: "1024x1024" });

// Bibliographic metadata about the source text.
const MetadataSchema = z.object({
  title: z.string().min(1),
  author: z.string().min(1),
  publication_year: z.number().int(),
  public_domain_proof_url: z.string().min(1),
  reading_level: z.string().min(1),
});

// Pipeline settings; the optional prompt strings default to "".
const PipelineConfigSchema = z.object({
  chunk_size: z.number().int().positive(),
  tts_voice_id: z.string().min(1),
  tts_instructions: z.string().optional().default(""),
  image_style_prompts: z.string().optional().default(""),
  intro_audio_file: z.string().min(1),
  outro_audio_file: z.string().min(1),
  background_music_file: z.string().min(1),
  export_settings: ExportSettingsSchema,
});

const StoryConfigSchema = z.object({
  metadata: MetadataSchema,
  config: PipelineConfigSchema,
});
export function loadStoryConfig(storyName: string): StoryConfig {
const configPath = path.join("stories", storyName, "config.yaml");
if (!fs.existsSync(configPath)) {
@@ -32,5 +58,7 @@ export function loadStoryConfig(storyName: string): StoryConfig {
}
const fileContents = fs.readFileSync(configPath, "utf8");
return yaml.load(fileContents) as StoryConfig;
const loaded = yaml.load(fileContents);
const parsed = StoryConfigSchema.parse(loaded);
return parsed as unknown as StoryConfig;
}

View File

@@ -7,6 +7,31 @@ const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
const allowedSizes = [
  "256x256",
  "512x512",
  "1024x1024",
  "1536x1024",
  "1024x1536",
  "1792x1024",
  "1024x1792",
] as const;
type AllowedSize = (typeof allowedSizes)[number];

/**
 * Map an arbitrary "WIDTHxHEIGHT" resolution string to an image size the
 * OpenAI Images API accepts, defaulting to a square.
 *
 * BUGFIX: this module calls `images.generate` with model "dall-e-3", which
 * only accepts "1024x1024", "1792x1024" and "1024x1792". The previous code
 * returned "1536x1024"/"1024x1536" for non-square input — those sizes belong
 * to gpt-image-1 and are rejected by dall-e-3. Orientation is preserved by
 * mapping to the 1792-based pair instead.
 *
 * @param resolution - optional "WIDTHxHEIGHT" string from the story config
 * @returns a dall-e-3-compatible size matching the requested orientation
 */
function pickImageSize(resolution?: string): AllowedSize {
  // Default square for simplicity (also covers missing/malformed input).
  if (!resolution) return "1024x1024";
  const match = resolution.match(/^(\d+)x(\d+)$/);
  if (!match) return "1024x1024";
  const width = parseInt(match[1], 10);
  const height = parseInt(match[2], 10);
  if (!Number.isFinite(width) || !Number.isFinite(height)) return "1024x1024";
  if (width === height) return "1024x1024";
  return width > height ? "1792x1024" : "1024x1792";
}
export async function generateImage(
storyName: string,
storyConfig: StoryConfig,
@@ -15,13 +40,18 @@ export async function generateImage(
imageIndex: number
): Promise<string> {
const imagePath = path.join("stories", storyName, "images", `chunk_${chunkIndex}_img${imageIndex}.png`);
const prompt = "A cartoon cat.";
const prompt = `${(storyConfig.config.image_style_prompts || "").trim()}
Illustration for the following passage:
"${chunk.slice(0, 500)}"`;
const size = pickImageSize(storyConfig.config.export_settings?.resolution);
const response = await openai.images.generate({
model: "dall-e-3", // Downgrading to dall-e-3 as gpt-image-1 is not available
model: "dall-e-3",
prompt,
n: 1,
size: "1024x1024",
size,
response_format: "b64_json",
});

View File

@@ -12,11 +12,46 @@ import { createVideo } from "./video";
import { createSrt } from "./subtitles";
import { generateYouTubeMetadata, uploadToYouTube } from "./uploader";
import * as path from "path";
import * as fs from "fs";
/**
 * Run `mapper` over every item of `items` with at most `limit` invocations
 * in flight at once. Resolves when all items have been processed.
 *
 * BUGFIX: previously, when one mapper rejected, `Promise.race` rethrew
 * immediately while the other in-flight promises (which carried only a
 * `.finally`, no `.catch`) kept running — any of their rejections then
 * surfaced as unhandled rejections, which crash modern Node. Now rejections
 * are captured, no new items are started after a failure, the in-flight
 * mappers are drained, and the first error is rethrown.
 *
 * @param items  - items to process
 * @param limit  - maximum number of concurrent mapper calls (values < 1 are
 *                 treated as 1 so the drain loop cannot hang on race([]))
 * @param mapper - async worker receiving the item and its index
 * @throws the first error any mapper rejected with
 */
async function mapWithConcurrency<T>(
  items: T[],
  limit: number,
  mapper: (item: T, index: number) => Promise<void>
): Promise<void> {
  if (items.length === 0) return;
  const cap = Math.max(1, Math.floor(limit) || 1);
  let nextIndex = 0;
  let failed = false;
  let firstError: unknown;
  const inFlight: Promise<void>[] = [];
  const launchNext = () => {
    if (nextIndex >= items.length || failed) return;
    const current = nextIndex++;
    const p = mapper(items[current], current)
      // Capture the rejection here so it never escapes as unhandled.
      .catch((err: unknown) => {
        if (!failed) {
          failed = true;
          firstError = err;
        }
      })
      .finally(() => {
        const idx = inFlight.indexOf(p);
        if (idx >= 0) inFlight.splice(idx, 1);
      });
    inFlight.push(p);
  };
  while (inFlight.length > 0 || (nextIndex < items.length && !failed)) {
    while (inFlight.length < cap && nextIndex < items.length && !failed) {
      launchNext();
    }
    // All in-flight promises resolve (errors were caught above), so this
    // only waits for the next slot to free up.
    await Promise.race(inFlight);
  }
  if (failed) throw firstError;
}
async function main() {
const storyName = process.argv[2];
const rawArgs = process.argv.slice(2);
const storyName = rawArgs.find((a) => !a.startsWith("--"));
const force = rawArgs.includes("--force");
const skipUpload = rawArgs.includes("--skip-upload");
const concurrencyArg = rawArgs.find((a) => a.startsWith("--concurrency="));
const concurrency = concurrencyArg ? Math.max(1, parseInt(concurrencyArg.split("=")[1], 10) || 3) : 3;
if (!storyName) {
console.error("Please provide a story name.");
console.error("Usage: ts-node src/orchestrator.ts <storyName> [--force] [--skip-upload] [--concurrency=N]");
process.exit(1);
}
@@ -31,6 +66,9 @@ async function main() {
process.exit(1);
}
const storyRoot = path.resolve("stories", storyName);
["audio", "images", "final_audio", "video"].forEach((d) => fs.mkdirSync(path.join(storyRoot, d), { recursive: true }));
console.log("Sanitizing text...");
const sanitizedText = sanitizeText(storyName);
console.log("Sanitized text:");
@@ -44,30 +82,49 @@ async function main() {
console.log("Generating intro/outro audio...");
const introFile = path.join("stories", storyName, storyConfig.config.intro_audio_file);
const outroFile = path.join("stories", storyName, storyConfig.config.outro_audio_file);
if (!fs.existsSync(introFile) || force) {
await generateSingleAudio(storyConfig, "This is the intro.", introFile);
} else {
console.log(`Skipping intro generation, exists: ${introFile}`);
}
if (!fs.existsSync(outroFile) || force) {
await generateSingleAudio(storyConfig, "This is the outro.", outroFile);
console.log("Generated intro/outro audio successfully.");
} else {
console.log(`Skipping outro generation, exists: ${outroFile}`);
}
console.log("Intro/outro audio ready.");
console.log("Generating audio...");
const audioFiles: string[] = [];
const chunkDurations: number[] = [];
for (let i = 0; i < chunks.length; i++) {
console.log(`Generating ${chunks.length} audio chunks with concurrency=${concurrency}...`);
const audioFiles: string[] = new Array(chunks.length);
const chunkDurations: number[] = new Array(chunks.length);
await mapWithConcurrency(chunks, concurrency, async (chunk, i) => {
const audioPath = path.join("stories", storyName, "audio", `chunk_${i}.mp3`);
if (!fs.existsSync(audioPath) || force) {
console.log(`Generating audio for chunk ${i}...`);
const audioFile = await generateAudio(storyConfig, storyName, chunks[i], i);
audioFiles.push(audioFile);
const duration = await getChunkDuration(audioFile);
chunkDurations.push(duration);
console.log(`Generated audio file: ${audioFile}, duration: ${duration}`);
await generateAudio(storyConfig, storyName, chunk, i);
} else {
console.log(`Skipping audio for chunk ${i}, exists.`);
}
const duration = await getChunkDuration(audioPath);
audioFiles[i] = audioPath;
chunkDurations[i] = duration;
console.log(`Audio chunk ${i} ready: ${audioPath}, duration: ${duration}`);
});
console.log("Generating images...");
const imageFiles: string[] = [];
for (let i = 0; i < chunks.length; i++) {
console.log(`Generating ${chunks.length} images with concurrency=${concurrency}...`);
const imageFiles: string[] = new Array(chunks.length);
await mapWithConcurrency(chunks, concurrency, async (chunk, i) => {
const imagePath = path.join("stories", storyName, "images", `chunk_${i}_img0.png`);
if (!fs.existsSync(imagePath) || force) {
console.log(`Generating image for chunk ${i}...`);
const imageFile = await generateImage(storyName, storyConfig, chunks[i], i, 0);
imageFiles.push(imageFile);
console.log(`Generated image file: ${imageFile}`);
const generated = await generateImage(storyName, storyConfig, chunk, i, 0);
imageFiles[i] = generated;
} else {
console.log(`Skipping image for chunk ${i}, exists.`);
imageFiles[i] = imagePath;
}
console.log(`Image ${i} ready: ${imageFiles[i]}`);
});
console.log("Creating subtitles...");
const srtPath = createSrt(storyName, chunks, chunkDurations);
@@ -81,6 +138,11 @@ async function main() {
await createVideo(storyName, storyConfig, imageFiles, chunkDurations, srtPath);
console.log("Created video successfully.");
if (skipUpload) {
console.log("Skipping upload step (--skip-upload).");
return;
}
console.log("Generating YouTube metadata...");
const metadata = generateYouTubeMetadata(storyConfig);
console.log("YouTube metadata:");

View File

@@ -1,11 +1,14 @@
import * as fs from "fs";
import * as path from "path";
function toSrtTime(seconds: number): string {
const date = new Date(0);
date.setSeconds(seconds);
const timeString = date.toISOString().substr(11, 12);
return timeString.replace(".", ",");
/**
 * Format a duration given in (possibly fractional) seconds as an SRT
 * timestamp, "HH:MM:SS,mmm". Negative inputs clamp to zero; hours wider
 * than two digits are kept as-is.
 */
function toSrtTime(secondsFloat: number): string {
  const roundedMs = Math.round(secondsFloat * 1000);
  let remaining = roundedMs > 0 ? roundedMs : 0;
  const ms = remaining % 1000;
  remaining = (remaining - ms) / 1000;
  const secs = remaining % 60;
  remaining = (remaining - secs) / 60;
  const mins = remaining % 60;
  const hrs = (remaining - mins) / 60;
  const two = (n: number) => String(n).padStart(2, "0");
  return `${two(hrs)}:${two(mins)}:${two(secs)},${String(ms).padStart(3, "0")}`;
}
export function createSrt(storyName: string, chunks: string[], chunkDurations: number[]): string {

View File

@@ -2,7 +2,14 @@ import { spawn } from "child_process";
import { StoryConfig } from "./config";
import * as path from "path";
const ffmpeg = require("ffmpeg-static");
import { getDuration } from "./audio";
/**
 * Escape a file path for use inside an ffmpeg filter-graph argument
 * (e.g. the `subtitles=` filter): each backslash, colon, comma and single
 * quote is prefixed with a backslash.
 */
function escapeForFilter(filePath: string): string {
  // Single pass over the string; equivalent to the chained replaces because
  // each source character is rewritten exactly once and the inserted
  // backslashes are never rescanned.
  return filePath.replace(/[\\:,']/g, (ch) => `\\${ch}`);
}
export async function createVideo(
storyName: string,
@@ -14,6 +21,7 @@ export async function createVideo(
const audioPath = path.resolve("stories", storyName, "final_audio", "final.mp3");
const videoPath = path.resolve("stories", storyName, "video", "final.mp4");
const totalDuration = chunkDurations.reduce((a, b) => a + b, 0);
const resolution = storyConfig.config.export_settings?.resolution || "1024x1024";
const inputs = imageFiles.map((file) => ["-loop", "1", "-i", file]).flat();
inputs.push("-i", audioPath);
@@ -21,15 +29,15 @@ export async function createVideo(
const filterGraph = imageFiles
.map((_, i) => {
const duration = chunkDurations[i];
const zoompan = `zoompan=z='min(zoom+0.0015,1.5)':d=${
25 * duration
}:x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':s=1024x1024`;
const zoompan = `zoompan=z='min(zoom+0.0015,1.5)':d=${25 * duration}:x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':s=${resolution}`;
return `[${i}:v]${zoompan},fade=t=out:st=${duration - 1}:d=1[v${i}]`;
})
.join(";");
const streamSpecifiers = imageFiles.map((_, i) => `[v${i}]`).join("");
const finalFilterGraph = `${filterGraph};${streamSpecifiers}concat=n=${imageFiles.length}:v=1:a=0,format=yuv420p[v];[v]subtitles=${srtPath}[v]`;
const escapedSrt = escapeForFilter(srtPath);
const concatGraph = `${filterGraph};${streamSpecifiers}concat=n=${imageFiles.length}:v=1:a=0,format=yuv420p[v0]`;
const finalFilterGraph = `${concatGraph};[v0]subtitles='${escapedSrt}'[v]`;
const args = [
"-y",