Voice Commands with Laravel: Integrating Whisper and GPT
·
13 min read
Introduction
Voice interfaces are becoming a staple of modern UX. From virtual assistants to voice search, the ability to interact by voice provides a more natural and convenient experience.
This article guides you through integrating:
- OpenAI Whisper: Accurate speech-to-text
- GPT: Natural language command processing
- TTS (Text-to-Speech): Voice responses
- Realtime: WebSocket for voice chat
Use Cases
- Voice search in applications
- Voice-controlled admin panels
- Accessibility features
- Voice notes and transcription
- Customer support chatbots
- Voice commands for IoT/smart home
Architecture
┌──────────────┐      ┌──────────────┐      ┌──────────────┐
│   Browser    │      │   Laravel    │      │    OpenAI    │
│   (Audio)    │─────▶│     API      │─────▶│   Whisper    │
└──────────────┘      └──────────────┘      └──────────────┘
       │                     │                     │
       │                     ▼                     │
       │              ┌──────────────┐             │
       │              │   Command    │             │
       │              │    Parser    │◀────────────┘
       │              └──────────────┘
       │                     │
       │                     ▼
       │              ┌──────────────┐
       │              │   Execute    │
       │              │    Action    │
       │              └──────────────┘
       │                     │
       │                     ▼
       │              ┌──────────────┐
       ◀──────────────│     TTS      │
                      │   Response   │
                      └──────────────┘
Setup
Dependencies
composer require openai-php/laravel
npm install recordrtc wavesurfer.js
Configuration
// config/services.php
return [
    // Settings for the OpenAI-backed voice features. Every value comes from
    // the environment so it can differ per deployment.
    'openai' => [
        // API key, read from OPENAI_API_KEY.
        // NOTE(review): confirm which component consumes this entry; the
        // openai-php/laravel package may read the key from its own config file.
        'api_key' => env('OPENAI_API_KEY'),

        // Speech-to-text model; defaults to 'whisper-1' when WHISPER_MODEL is unset.
        'whisper_model' => env('WHISPER_MODEL', 'whisper-1'),

        // Text-to-speech model; defaults to 'tts-1' when TTS_MODEL is unset.
        'tts_model' => env('TTS_MODEL', 'tts-1'),

        // Text-to-speech voice; defaults to 'alloy' when TTS_VOICE is unset.
        'tts_voice' => env('TTS_VOICE', 'alloy'),
    ],
];
# .env
OPENAI_API_KEY=sk-...
WHISPER_MODEL=whisper-1
TTS_MODEL=tts-1
TTS_VOICE=alloy
Speech-to-Text with Whisper
Voice Transcription Service
// app/Services/Voice/TranscriptionService.php
namespace App\Services\Voice;
use OpenAI\Laravel\Facades\OpenAI;
use Illuminate\Http\UploadedFile;
use Illuminate\Support\Facades\Storage;
class TranscriptionService
{
    /**
     * Transcribe an uploaded audio file to text.
     *
     * The upload is copied to the local disk for the duration of the API call
     * and deleted afterwards, whether or not the call succeeds.
     *
     * @param UploadedFile $audioFile Audio upload to transcribe.
     * @param string|null  $language  Optional language hint. It is left out of
     *                                the request entirely when null, because the
     *                                multipart request cannot carry a null field.
     *
     * @return array Keys: success, text, language, duration, segments, words.
     *
     * @throws \RuntimeException When the upload cannot be stored or reopened.
     */
    public function transcribe(UploadedFile $audioFile, ?string $language = null): array
    {
        $path = $this->storeTemporarily($audioFile);

        try {
            $response = $this->withAudioHandle(
                Storage::disk('local')->path($path),
                function ($handle) use ($language) {
                    $parameters = [
                        'model' => config('services.openai.whisper_model'),
                        'file' => $handle,
                        'response_format' => 'verbose_json',
                        'timestamp_granularities' => ['word', 'segment'],
                    ];

                    if ($language !== null) {
                        $parameters['language'] = $language;
                    }

                    return OpenAI::audio()->transcribe($parameters);
                }
            );

            return [
                'success' => true,
                'text' => $response->text,
                'language' => $response->language,
                'duration' => $response->duration,
                'segments' => $response->segments ?? [],
                'words' => $response->words ?? [],
            ];
        } finally {
            // Clean up the temporary copy even when the API call throws.
            Storage::disk('local')->delete($path);
        }
    }

    /**
     * Download audio from an HTTP(S) URL and transcribe it.
     *
     * @param string $audioUrl Absolute http:// or https:// URL of the audio.
     *
     * @return array Keys: success, text.
     *
     * @throws \InvalidArgumentException When the URL is not http(s). This stops
     *                                   local paths and stream wrappers from
     *                                   being read through this method.
     * @throws \RuntimeException         When the download or a temp-file
     *                                   operation fails.
     */
    public function transcribeFromUrl(string $audioUrl): array
    {
        $scheme = strtolower((string) parse_url($audioUrl, PHP_URL_SCHEME));
        if (!in_array($scheme, ['http', 'https'], true)) {
            throw new \InvalidArgumentException('Audio URL must use the http or https scheme.');
        }

        $content = file_get_contents($audioUrl);
        if ($content === false) {
            throw new \RuntimeException("Unable to download audio from {$audioUrl}.");
        }

        // tempnam() creates an empty placeholder file. The audio is written to a
        // sibling path with the .mp3 suffix, so both files are removed below.
        $placeholder = tempnam(sys_get_temp_dir(), 'audio_');
        if ($placeholder === false) {
            throw new \RuntimeException('Unable to create a temporary file for the audio.');
        }
        $tempPath = $placeholder . '.mp3';

        try {
            if (file_put_contents($tempPath, $content) === false) {
                throw new \RuntimeException('Unable to write the downloaded audio to disk.');
            }

            $response = $this->withAudioHandle(
                $tempPath,
                fn ($handle) => OpenAI::audio()->transcribe([
                    'model' => config('services.openai.whisper_model'),
                    'file' => $handle,
                    'response_format' => 'json',
                ])
            );

            return [
                'success' => true,
                'text' => $response->text,
            ];
        } finally {
            foreach ([$tempPath, $placeholder] as $file) {
                if (is_file($file)) {
                    unlink($file);
                }
            }
        }
    }

    /**
     * Translate speech in any supported language to English text.
     *
     * @param UploadedFile $audioFile Audio upload to translate.
     *
     * @return array Keys: success, text.
     *
     * @throws \RuntimeException When the upload cannot be stored or reopened.
     */
    public function translate(UploadedFile $audioFile): array
    {
        $path = $this->storeTemporarily($audioFile);

        try {
            $response = $this->withAudioHandle(
                Storage::disk('local')->path($path),
                fn ($handle) => OpenAI::audio()->translate([
                    'model' => config('services.openai.whisper_model'),
                    'file' => $handle,
                    'response_format' => 'json',
                ])
            );

            return [
                'success' => true,
                'text' => $response->text,
            ];
        } finally {
            Storage::disk('local')->delete($path);
        }
    }

    /**
     * Copy the upload to temp/audio on the local disk and return its relative path.
     */
    private function storeTemporarily(UploadedFile $audioFile): string
    {
        $path = $audioFile->store('temp/audio', 'local');

        if ($path === false) {
            throw new \RuntimeException('Unable to store the uploaded audio file.');
        }

        return $path;
    }

    /**
     * Open $fullPath for binary reading, pass the handle to $callback, and
     * always close the handle afterwards.
     *
     * @param string   $fullPath Absolute path of the audio file.
     * @param callable $callback Receives the open stream resource.
     *
     * @return mixed Whatever $callback returns.
     */
    private function withAudioHandle(string $fullPath, callable $callback): mixed
    {
        $handle = fopen($fullPath, 'rb');
        if ($handle === false) {
            throw new \RuntimeException("Unable to open audio file {$fullPath}.");
        }

        try {
            return $callback($handle);
        } finally {
            // The HTTP layer may already have closed the stream it wrapped.
            if (is_resource($handle)) {
                fclose($handle);
            }
        }
    }
}
Voice Command Controller
// app/Http/Controllers/VoiceController.php
namespace App\Http\Controllers;
use App\Services\Voice\TranscriptionService;
use App\Services\Voice\CommandProcessor;
use App\Services\Voice\TextToSpeechService;
use Illuminate\Http\Request;
use Illuminate\Http\JsonResponse;
class VoiceController extends Controller
{
    public function __construct(
        private TranscriptionService $transcription,
        private CommandProcessor $commandProcessor,
        private TextToSpeechService $tts,
    ) {}

    /**
     * Transcribe an uploaded audio clip and return the transcription as JSON.
     *
     * Request fields: `audio` (required file), `language` (optional
     * two-character code passed to the transcription service as a hint).
     */
    public function transcribe(Request $request): JsonResponse
    {
        $request->validate([
            'audio' => 'required|file|mimes:mp3,wav,webm,m4a,ogg|max:25000',
            'language' => 'nullable|string|size:2',
        ]);

        $result = $this->transcription->transcribe(
            $request->file('audio'),
            $request->input('language')
        );

        return response()->json($result);
    }

    /**
     * Run a spoken command: transcribe the audio, execute the parsed command,
     * and optionally synthesize the reply as speech.
     *
     * Request fields: `audio` (required file), `voice_response` (boolean; when
     * truthy the reply is also returned as generated audio).
     *
     * Responds with 500 when transcription fails and 422 when no speech was
     * recognized.
     */
    public function command(Request $request): JsonResponse
    {
        $request->validate([
            'audio' => 'required|file|mimes:mp3,wav,webm,m4a,ogg|max:25000',
        ]);

        // 1. Transcribe audio to text. The service signals failure by throwing,
        //    so the exception is caught here to return the JSON error.
        try {
            $transcription = $this->transcription->transcribe($request->file('audio'));
        } catch (\Throwable $e) {
            report($e);

            return response()->json(['error' => 'Transcription failed'], 500);
        }

        if (!($transcription['success'] ?? false)) {
            return response()->json(['error' => 'Transcription failed'], 500);
        }

        $text = trim((string) ($transcription['text'] ?? ''));
        if ($text === '') {
            // Nothing was recognized, so skip command parsing.
            return response()->json(['error' => 'No speech detected'], 422);
        }

        // 2. Process command
        $commandResult = $this->commandProcessor->process($text);

        // 3. Generate voice response (optional)
        $audioResponse = null;
        if ($request->boolean('voice_response')) {
            $audioResponse = $this->tts->generate($commandResult['response']);
        }

        return response()->json([
            'transcription' => $transcription['text'],
            'command' => $commandResult['command'],
            'response' => $commandResult['response'],
            'audio_response' => $audioResponse,
        ]);
    }
}
Command Processing
Natural Language Command Parser
// app/Services/Voice/CommandProcessor.php
namespace App\Services\Voice;
use OpenAI\Laravel\Facades\OpenAI;
use App\Services\Voice\Commands\CommandInterface;
use Illuminate\Support\Facades\Log;
class CommandProcessor
{
    /** @var array<string, CommandInterface> Command handlers keyed by command name. */
    private array $commands = [];

    public function __construct()
    {
        $this->registerCommands();
    }

    /**
     * Turn transcribed speech into a command, execute it, and build a reply.
     *
     * @param string $input Transcribed user speech.
     *
     * @return array Always has `command` and `response`; also has `parameters`
     *               and `result` when a registered command was executed.
     */
    public function process(string $input): array
    {
        // Parse intent with GPT
        $intent = $this->parseIntent($input);

        Log::info('Voice command processed', [
            'input' => $input,
            'intent' => $intent,
        ]);

        // parseIntent() is overridable, so read its result defensively.
        $name = $intent['command'] ?? 'unknown';
        $parameters = $intent['parameters'] ?? [];

        // Execute command
        if (is_string($name) && is_array($parameters) && isset($this->commands[$name])) {
            $result = $this->commands[$name]->execute($parameters);

            return [
                'command' => $name,
                'parameters' => $parameters,
                'result' => $result,
                'response' => $this->generateResponse($name, $result),
            ];
        }

        return [
            'command' => 'unknown',
            'response' => $this->handleUnknownCommand($input),
        ];
    }

    /**
     * Ask the chat model to classify the input as one of the registered commands.
     *
     * The reply is normalized so that `command` is always a string and
     * `parameters` is always an array, even when the model returns malformed
     * or incomplete JSON.
     *
     * @return array Decoded intent with at least `command` and `parameters`.
     */
    protected function parseIntent(string $input): array
    {
        $response = OpenAI::chat()->create([
            'model' => $this->chatModel(),
            'messages' => [
                [
                    'role' => 'system',
                    'content' => $this->getIntentParsingPrompt(),
                ],
                [
                    'role' => 'user',
                    'content' => $input,
                ],
            ],
            'response_format' => ['type' => 'json_object'],
            'temperature' => 0.3,
        ]);

        $decoded = json_decode((string) ($response->choices[0]->message->content ?? ''), true);

        if (!is_array($decoded)) {
            $decoded = [];
        }
        if (!is_string($decoded['command'] ?? null)) {
            $decoded['command'] = 'unknown';
        }
        if (!is_array($decoded['parameters'] ?? null)) {
            $decoded['parameters'] = [];
        }

        return $decoded;
    }

    /**
     * Build the system prompt that lists every registered command.
     */
    protected function getIntentParsingPrompt(): string
    {
        $commandList = $this->describeCommands();

        return <<<PROMPT
        You are a voice command parser. Parse the user's voice input and extract:
        1. The command they want to execute
        2. Any parameters for that command
        Available commands:
        {$commandList}
        Respond in JSON format:
        {
        "command": "command_name",
        "parameters": {
        "param1": "value1"
        },
        "confidence": 0.95
        }
        If the command is unclear, use "unknown" as the command.
        PROMPT;
    }

    /**
     * Register the command handlers available to voice input.
     */
    protected function registerCommands(): void
    {
        $this->commands = [
            'search' => new Commands\SearchCommand(),
            'create_note' => new Commands\CreateNoteCommand(),
            'send_email' => new Commands\SendEmailCommand(),
            'show_dashboard' => new Commands\ShowDashboardCommand(),
            'navigate' => new Commands\NavigateCommand(),
            'play_music' => new Commands\PlayMusicCommand(),
            'set_reminder' => new Commands\SetReminderCommand(),
        ];
    }

    /**
     * Render the spoken/text reply for an executed command.
     *
     * Commands with a template have `{placeholder}` tokens filled from the
     * result array; anything else gets a generic confirmation.
     */
    protected function generateResponse(string $command, mixed $result): string
    {
        $templates = [
            'search' => "I found {count} results for \"{query}\"",
            'create_note' => "Created a note with title \"{title}\"",
            'send_email' => "Email sent to {recipient}",
            'navigate' => "Navigating to {page}",
            'set_reminder' => "Reminder set for {time}",
        ];

        if (isset($templates[$command]) && is_array($result)) {
            return $this->interpolate($templates[$command], $result);
        }

        return "Executed command {$command}";
    }

    /**
     * Ask the chat model for a polite reply to input that matched no command.
     *
     * The registered commands are included in the prompt so the model can
     * name the real options.
     */
    protected function handleUnknownCommand(string $input): string
    {
        $response = OpenAI::chat()->create([
            'model' => $this->chatModel(),
            'messages' => [
                [
                    'role' => 'system',
                    'content' => 'You are a helpful assistant. The user said something that is not a recognized command. Politely explain what commands are available and ask them to try again.'
                        . "\n\nAvailable commands:\n" . $this->describeCommands(),
                ],
                [
                    'role' => 'user',
                    'content' => $input,
                ],
            ],
        ]);

        return (string) ($response->choices[0]->message->content ?? '');
    }

    /**
     * One "- name: description" line per registered command.
     */
    private function describeCommands(): string
    {
        return implode("\n", array_map(
            fn ($cmd, $handler) => "- {$cmd}: {$handler->getDescription()}",
            array_keys($this->commands),
            $this->commands
        ));
    }

    /**
     * Chat model used for intent parsing and fallback replies. Reads
     * services.openai.chat_model and falls back to 'gpt-4o'.
     */
    private function chatModel(): string
    {
        return (string) config('services.openai.chat_model', 'gpt-4o');
    }

    /**
     * Replace `{key}` tokens in $template with the matching values from $data.
     *
     * Only scalar, null and Stringable values are substituted. Nested arrays
     * (such as a list of search results) are skipped because they cannot be
     * rendered into the sentence.
     */
    private function interpolate(string $template, array $data): string
    {
        $replacements = [];

        foreach ($data as $key => $value) {
            if (is_scalar($value) || $value === null || $value instanceof \Stringable) {
                $replacements['{' . $key . '}'] = (string) $value;
            }
        }

        // strtr() substitutes in one pass, so a value that itself contains
        // "{...}" is never re-expanded.
        return strtr($template, $replacements);
    }
}
Example Command Implementation
// app/Services/Voice/Commands/SearchCommand.php
namespace App\Services\Voice\Commands;
use App\Models\Post;
class SearchCommand implements CommandInterface
{
    /**
     * Human-readable summary of the command and its parameters.
     */
    public function getDescription(): string
    {
        return "Search for content. Parameters: query (string)";
    }

    /**
     * Search posts for the given query and summarize up to ten matches.
     *
     * @param array $parameters Reads `query`; a missing query is treated as ''.
     *
     * @return array Keys: query, count, results (each with id, title, url).
     */
    public function execute(array $parameters): array
    {
        $term = $parameters['query'] ?? '';

        // NOTE(review): Post::search() is presumably a full-text search scope
        // on the model — confirm against the Post class.
        $matches = Post::search($term)->take(10)->get();

        $summaries = $matches->map(function ($post) {
            return [
                'id' => $post->id,
                'title' => $post->title,
                'url' => route('posts.show', $post),
            ];
        });

        return [
            'query' => $term,
            'count' => $matches->count(),
            'results' => $summaries->toArray(),
        ];
    }
}
// app/Services/Voice/Commands/CreateNoteCommand.php
namespace App\Services\Voice\Commands;
use App\Models\Note;
use Illuminate\Support\Facades\Auth;
class CreateNoteCommand implements CommandInterface
{
    /**
     * Human-readable summary of the command and its parameters.
     */
    public function getDescription(): string
    {
        return "Create a new note. Parameters: title, content";
    }

    /**
     * Create a note owned by the current user.
     *
     * @param array $parameters Reads `title` (defaults to 'Untitled Note') and
     *                          `content` (defaults to '').
     *
     * @return array Keys: id, title of the created note.
     */
    public function execute(array $parameters): array
    {
        $attributes = [
            'user_id' => Auth::id(),
            'title' => $parameters['title'] ?? 'Untitled Note',
            'content' => $parameters['content'] ?? '',
        ];

        $created = Note::create($attributes);

        return [
            'id' => $created->id,
            'title' => $created->title,
        ];
    }
}
Text-to-Speech
TTS Service
// app/Services/Voice/TextToSpeechService.php
namespace App\Services\Voice;
use OpenAI\Laravel\Facades\OpenAI;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Str;
class TextToSpeechService
{
    /**
     * Synthesize speech for $text and save it as an MP3 on the public disk.
     *
     * @param string      $text  Text to speak.
     * @param string|null $voice Voice name; null uses services.openai.tts_voice
     *                           (falling back to 'alloy').
     * @param string|null $model TTS model; null uses services.openai.tts_model
     *                           (falling back to 'tts-1').
     *
     * @return array Keys: url, path (relative to the public disk), text, voice.
     *
     * @throws \RuntimeException When the audio cannot be written to storage.
     */
    public function generate(
        string $text,
        ?string $voice = null,
        ?string $model = null
    ): array {
        $voice = $voice ?? config('services.openai.tts_voice', 'alloy');
        $model = $model ?? config('services.openai.tts_model', 'tts-1');

        $response = OpenAI::audio()->speech([
            'model' => $model,
            'voice' => $voice,
            'input' => $text,
            'response_format' => 'mp3',
        ]);

        // Save audio file under a random name.
        $filename = 'tts/' . Str::uuid() . '.mp3';
        if (!Storage::disk('public')->put($filename, $response)) {
            throw new \RuntimeException("Unable to store generated speech at {$filename}.");
        }

        return [
            'url' => Storage::disk('public')->url($filename),
            'path' => $filename,
            'text' => $text,
            'voice' => $voice,
        ];
    }

    /**
     * Synthesize speech for $text and yield the audio chunk by chunk.
     *
     * Uses the configured TTS model (services.openai.tts_model, falling back
     * to 'tts-1').
     *
     * @param string $text  Text to speak.
     * @param string $voice Voice name.
     *
     * @return \Generator Yields the chunks produced by the streamed API response.
     */
    public function stream(string $text, string $voice = 'alloy'): \Generator
    {
        $response = OpenAI::audio()->speechStreamed([
            'model' => config('services.openai.tts_model', 'tts-1'),
            'voice' => $voice,
            'input' => $text,
        ]);

        foreach ($response as $chunk) {
            yield $chunk;
        }
    }

    /**
     * Supported voice names mapped to a short description of each.
     *
     * @return array<string, string>
     */
    public function getAvailableVoices(): array
    {
        return [
            'alloy' => 'Neutral, balanced voice',
            'echo' => 'Warm, engaging voice',
            'fable' => 'British accent, narrative style',
            'onyx' => 'Deep, authoritative voice',
            'nova' => 'Friendly, conversational voice',
            'shimmer' => 'Clear, expressive voice',
        ];
    }
}
Streaming TTS Controller
// app/Http/Controllers/TTSController.php
namespace App\Http\Controllers;
use App\Services\Voice\TextToSpeechService;
use Illuminate\Http\Request;
use Symfony\Component\HttpFoundation\StreamedResponse;
class TTSController extends Controller
{
    public function __construct(
        private TextToSpeechService $tts
    ) {}

    /**
     * Generate an MP3 for the given text and return its metadata as JSON.
     *
     * Request fields: `text` (required, max 4096 chars), `voice` (optional,
     * one of the voices the TTS service supports). When `voice` is omitted or
     * empty, the service's configured default voice is used.
     */
    public function generate(Request $request): \Illuminate\Http\JsonResponse
    {
        $request->validate([
            'text' => 'required|string|max:4096',
            'voice' => $this->voiceRule(),
        ]);

        $result = $this->tts->generate(
            $request->input('text'),
            $request->input('voice')
        );

        return response()->json($result);
    }

    /**
     * Stream synthesized speech for the given text as audio/mpeg.
     *
     * Accepts the same request fields as generate().
     */
    public function stream(Request $request): StreamedResponse
    {
        $request->validate([
            'text' => 'required|string|max:4096',
            'voice' => $this->voiceRule(),
        ]);

        $text = $request->input('text');
        // input('voice', $default) still returns null when the field is present
        // but empty, so the fallback is applied with ?? instead.
        $voice = $request->input('voice') ?? config('services.openai.tts_voice', 'alloy');

        return response()->stream(function () use ($text, $voice) {
            foreach ($this->tts->stream($text, $voice) as $chunk) {
                echo $chunk;

                // ob_flush() raises a notice when no output buffer is active.
                if (ob_get_level() > 0) {
                    ob_flush();
                }
                flush();
            }
        }, 200, [
            'Content-Type' => 'audio/mpeg',
            'Cache-Control' => 'no-cache',
        ]);
    }

    /**
     * Validation rule for `voice`, built from the voices the service supports
     * so both endpoints accept exactly the same set.
     */
    private function voiceRule(): string
    {
        return 'nullable|string|in:' . implode(',', array_keys($this->tts->getAvailableVoices()));
    }
}
Frontend Integration
Voice Recorder Component
// resources/js/components/VoiceRecorder.js
import RecordRTC from 'recordrtc';
/**
 * Records microphone audio with RecordRTC and uploads it to the voice API.
 *
 * Callbacks (all optional, default to no-ops):
 *  - onRecordingStart(): recording has begun
 *  - onRecordingStop(): recording has ended
 *  - onTranscription(result): the server's JSON reply to the upload
 *  - onError(error): the microphone could not be opened
 */
export default class VoiceRecorder {
    constructor(options = {}) {
        this.options = {
            onTranscription: () => {},
            onError: () => {},
            onRecordingStart: () => {},
            onRecordingStop: () => {},
            ...options
        };
        this.recorder = null;
        this.stream = null;
        this.isRecording = false;
    }

    /**
     * Request the microphone and start recording. Does nothing when a
     * recording is already in progress. Microphone failures are reported via
     * onError rather than thrown.
     */
    async start() {
        if (this.isRecording) {
            return;
        }

        try {
            this.stream = await navigator.mediaDevices.getUserMedia({
                audio: {
                    echoCancellation: true,
                    noiseSuppression: true,
                    sampleRate: 16000,
                }
            });

            // StereoAudioRecorder encodes WAV, so the declared MIME type must
            // be audio/wav rather than audio/webm.
            this.recorder = new RecordRTC(this.stream, {
                type: 'audio',
                mimeType: 'audio/wav',
                recorderType: RecordRTC.StereoAudioRecorder,
                numberOfAudioChannels: 1,
                desiredSampRate: 16000,
            });

            this.recorder.startRecording();
            this.isRecording = true;
            this.options.onRecordingStart();
        } catch (error) {
            // Do not leave the microphone open if setup failed part-way.
            this._releaseStream();
            this.recorder = null;
            this.options.onError(error);
        }
    }

    /**
     * Stop recording, upload the audio via transcribe(), and resolve with the
     * server's reply. Resolves with null when nothing is being recorded and
     * rejects when the upload fails.
     */
    async stop() {
        if (!this.recorder || !this.isRecording) {
            return null;
        }

        const recorder = this.recorder;

        return new Promise((resolve, reject) => {
            recorder.stopRecording(async () => {
                try {
                    const blob = recorder.getBlob();
                    this.isRecording = false;
                    this.options.onRecordingStop();

                    // Stop all tracks so the browser releases the microphone.
                    this._releaseStream();

                    // Send to server
                    const result = await this.transcribe(blob);
                    this.options.onTranscription(result);
                    resolve(result);
                } catch (error) {
                    // Without this the promise would never settle.
                    reject(error);
                }
            });
        });
    }

    /**
     * Upload a recording to /api/voice/transcribe and return the parsed JSON.
     */
    async transcribe(blob) {
        const formData = new FormData();
        formData.append('audio', blob, this._fileName(blob, 'recording'));

        return this._upload('/api/voice/transcribe', formData);
    }

    /**
     * Upload a recording to /api/voice/command, requesting a spoken reply,
     * and return the parsed JSON.
     */
    async sendCommand(blob) {
        const formData = new FormData();
        formData.append('audio', blob, this._fileName(blob, 'command'));
        formData.append('voice_response', 'true');

        return this._upload('/api/voice/command', formData);
    }

    /**
     * POST form data and return the JSON body. Throws an Error carrying
     * `status` and `body` when the server responds with a non-2xx status.
     */
    async _upload(url, formData) {
        const csrfToken = document.querySelector('meta[name="csrf-token"]')?.content ?? '';

        const response = await fetch(url, {
            method: 'POST',
            headers: {
                // Ask for JSON so validation failures are not sent as redirects.
                'Accept': 'application/json',
                'X-CSRF-TOKEN': csrfToken,
            },
            body: formData
        });

        if (!response.ok) {
            const error = new Error(`Voice request to ${url} failed with status ${response.status}`);
            error.status = response.status;
            error.body = await response.json().catch(() => null);
            throw error;
        }

        return response.json();
    }

    /**
     * Pick a file name whose extension matches the blob's actual format.
     */
    _fileName(blob, baseName) {
        const extension = (blob.type || '').includes('wav') ? 'wav' : 'webm';
        return `${baseName}.${extension}`;
    }

    /**
     * Stop every track of the current media stream, if any.
     */
    _releaseStream() {
        if (this.stream) {
            this.stream.getTracks().forEach(track => track.stop());
            this.stream = null;
        }
    }
}
Voice Command Button Component
<!-- resources/js/components/VoiceCommandButton.vue -->
<template>
  <div class="voice-command">
    <!-- Hold to record; release (or leave the button) to send the command.
         .prevent on the touch events stops the browser from also firing the
         emulated mouse events, which would start a second recording. -->
    <button
      type="button"
      aria-label="Hold to record a voice command"
      @mousedown="startRecording"
      @mouseup="stopRecording"
      @mouseleave="stopRecording"
      @touchstart.prevent="startRecording"
      @touchend.prevent="stopRecording"
      @touchcancel.prevent="stopRecording"
      :class="{ 'recording': isRecording }"
      class="voice-btn"
    >
      <svg v-if="!isRecording" class="w-8 h-8" fill="currentColor" viewBox="0 0 24 24">
        <path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3z"/>
        <path d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z"/>
      </svg>
      <span v-else class="recording-indicator">
        <span class="pulse"></span>
      </span>
    </button>
    <div v-if="transcription" class="transcription-result">
      <p class="text-sm text-gray-600">{{ transcription }}</p>
    </div>
    <div v-if="response" class="response">
      <p>{{ response }}</p>
      <audio v-if="audioUrl" :src="audioUrl" autoplay></audio>
    </div>
  </div>
</template>
<script>
import VoiceRecorder from './VoiceRecorder';

// This component shows `transcription`, `response` and `audio_response`,
// which only the command endpoint returns. VoiceRecorder.stop() uploads
// through transcribe(), so that step is redirected to sendCommand() here.
class CommandRecorder extends VoiceRecorder {
  transcribe(blob) {
    return this.sendCommand(blob);
  }
}

export default {
  data() {
    return {
      isRecording: false,
      // True while the button is physically held down.
      isHolding: false,
      transcription: null,
      response: null,
      audioUrl: null,
      recorder: null,
    };
  },
  mounted() {
    this.recorder = new CommandRecorder({
      onRecordingStart: () => {
        this.isRecording = true;
        this.transcription = null;
        this.response = null;
        this.audioUrl = null;
      },
      onRecordingStop: () => {
        this.isRecording = false;
      },
      onTranscription: (result) => {
        this.transcription = result?.transcription ?? null;
        this.response = result?.response ?? null;
        this.audioUrl = result?.audio_response?.url ?? null;
      },
      onError: (error) => {
        console.error('Recording error:', error);
        alert('Could not access microphone');
      }
    });
  },
  methods: {
    async startRecording() {
      if (this.isHolding) {
        return;
      }
      this.isHolding = true;

      await this.recorder.start();

      // The button may have been released while the microphone permission
      // prompt was still open; finish immediately in that case.
      if (!this.isHolding) {
        await this.finishRecording();
      }
    },
    async stopRecording() {
      if (!this.isHolding) {
        return;
      }
      this.isHolding = false;

      await this.finishRecording();
    },
    async finishRecording() {
      if (!this.isRecording) {
        return;
      }

      try {
        await this.recorder.stop();
      } catch (error) {
        console.error('Voice command failed:', error);
      }
    }
  }
};
</script>
<style scoped>
/* Round microphone button. */
.voice-btn {
  @apply w-16 h-16 rounded-full bg-blue-500 text-white flex items-center justify-center;
  @apply hover:bg-blue-600 transition-all;
}
/* Red, pulsing state while a recording is in progress. */
.voice-btn.recording {
  @apply bg-red-500 animate-pulse;
}
.recording-indicator {
  @apply relative w-4 h-4;
}
/* White dot that grows and fades once per second. */
.pulse {
  @apply absolute w-full h-full bg-white rounded-full;
  animation: pulse 1s infinite;
}
@keyframes pulse {
  0% { transform: scale(1); opacity: 1; }
  50% { transform: scale(1.5); opacity: 0.5; }
  100% { transform: scale(1); opacity: 1; }
}
</style>
Realtime Voice Chat
WebSocket Integration
// app/Events/VoiceMessage.php
namespace App\Events;
use Illuminate\Broadcasting\Channel;
use Illuminate\Broadcasting\InteractsWithSockets;
use Illuminate\Contracts\Broadcasting\ShouldBroadcast;
/**
 * Broadcast event carrying one voice message sent to a room.
 */
class VoiceMessage implements ShouldBroadcast
{
    use InteractsWithSockets;

    /**
     * @param string $roomId        Room the message belongs to; also selects the channel.
     * @param string $userId        Identifier of the sender.
     * @param string $audioUrl      URL of the stored audio clip.
     * @param string $transcription Text transcription of the clip.
     */
    public function __construct(
        public string $roomId,
        public string $userId,
        public string $audioUrl,
        public string $transcription,
    ) {}

    /**
     * Channel the event is broadcast on: "voice-room.<roomId>".
     *
     * NOTE(review): Channel appears to be an unauthenticated channel type —
     * confirm that room messages are meant to be readable without authorization.
     */
    public function broadcastOn(): Channel
    {
        $channelName = 'voice-room.' . $this->roomId;

        return new Channel($channelName);
    }
}
// app/Http/Controllers/VoiceChatController.php
namespace App\Http\Controllers;
use App\Events\VoiceMessage;
use App\Services\Voice\TranscriptionService;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Storage;
class VoiceChatController extends Controller
{
    public function __construct(
        private TranscriptionService $transcription
    ) {}

    /**
     * Accept a voice message for a room, transcribe it, store the audio on the
     * public disk, and broadcast both to the room.
     *
     * Responds with 401 when there is no authenticated user, since the
     * broadcast event requires a sender id.
     *
     * @param Request $request Must contain an `audio` file.
     * @param string  $roomId  Room identifier taken from the route.
     */
    public function sendMessage(Request $request, string $roomId): \Illuminate\Http\JsonResponse
    {
        $request->validate([
            'audio' => 'required|file|mimes:webm,mp3,wav|max:10000',
        ]);

        $userId = auth()->id();
        abort_if($userId === null, 401);

        $audio = $request->file('audio');

        // Transcribe before persisting, so a failed transcription does not
        // leave an orphaned file on the public disk.
        $result = $this->transcription->transcribe($audio);

        // Save audio
        $path = $audio->store("voice-messages/{$roomId}", 'public');
        if ($path === false) {
            throw new \RuntimeException('Unable to store the voice message.');
        }
        $audioUrl = Storage::disk('public')->url($path);

        // Broadcast to room
        event(new VoiceMessage(
            roomId: $roomId,
            userId: (string) $userId,
            audioUrl: $audioUrl,
            transcription: $result['text'],
        ));

        return response()->json([
            'success' => true,
            'audio_url' => $audioUrl,
            'transcription' => $result['text'],
        ]);
    }
}
Conclusion
Voice interfaces open up many new possibilities for Laravel applications:
- Accessibility: Help users with disabilities use apps more easily
- Convenience: Faster interaction than typing
- Natural: Users communicate with the app as naturally as they would with another person
- Multitasking: Use when hands are busy
Best Practices
- Clear feedback: Let users know when recording is in progress
- Error handling: Handle cases where the audio is unclear or nothing was recognized
- Privacy: Notify users that their audio is being recorded
- Fallback: Always offer a text input option