Voice Commands with Laravel: Integrating Whisper and GPT

· 13 min read

Introduction

Voice interfaces are becoming increasingly common in modern UX. From virtual assistants to voice search, the ability to interact by voice provides a more natural and convenient experience.

This article guides you through integrating:

  • OpenAI Whisper: Accurate speech-to-text
  • GPT: Natural language command processing
  • TTS (Text-to-Speech): Voice responses
  • Realtime: WebSocket for voice chat

Use Cases

  • Voice search in applications
  • Voice-controlled admin panels
  • Accessibility features
  • Voice notes and transcription
  • Customer support chatbots
  • Voice commands for IoT/smart home

Architecture

┌──────────────┐      ┌──────────────┐      ┌──────────────┐
│   Browser    │      │   Laravel    │      │   OpenAI     │
│   (Audio)    │─────▶│    API       │─────▶│   Whisper    │
└──────────────┘      └──────────────┘      └──────────────┘
       │                     │                     │
       │                     ▼                     │
       │              ┌──────────────┐             │
       │              │   Command    │             │
       │              │   Parser     │◀────────────┘
       │              └──────────────┘
       │                     │
       │                     ▼
       │              ┌──────────────┐
       │              │   Execute    │
       │              │   Action     │
       │              └──────────────┘
       │                     │
       │                     ▼
       │              ┌──────────────┐
       ◀──────────────│    TTS       │
                      │   Response   │
                      └──────────────┘

Setup

Dependencies

composer require openai-php/laravel
npm install recordrtc wavesurfer.js

Configuration

// config/services.php
return [
    // OpenAI settings read by the voice services through config('services.openai.*').
    'openai' => [
        'api_key' => env('OPENAI_API_KEY'),
        // Speech-to-text model; defaults to 'whisper-1' when WHISPER_MODEL is not set.
        'whisper_model' => env('WHISPER_MODEL', 'whisper-1'),
        // Text-to-speech model; defaults to 'tts-1' when TTS_MODEL is not set.
        'tts_model' => env('TTS_MODEL', 'tts-1'),
        // Default text-to-speech voice; defaults to 'alloy' when TTS_VOICE is not set.
        'tts_voice' => env('TTS_VOICE', 'alloy'),
    ],
];
# .env
OPENAI_API_KEY=sk-...
WHISPER_MODEL=whisper-1
TTS_MODEL=tts-1
TTS_VOICE=alloy

Speech-to-Text with Whisper

Voice Transcription Service

// app/Services/Voice/TranscriptionService.php
namespace App\Services\Voice;

use OpenAI\Laravel\Facades\OpenAI;
use Illuminate\Http\UploadedFile;
use Illuminate\Support\Facades\Storage;

class TranscriptionService
{
    /**
     * Transcribe an uploaded audio file with the configured Whisper model.
     *
     * The upload is copied to the local disk under temp/audio for the duration
     * of the call and removed afterwards, whether or not the request succeeds.
     *
     * @param UploadedFile $audioFile Audio upload to transcribe.
     * @param string|null  $language  Optional language hint; left out of the request when null.
     *
     * @return array Keys: success, text, language, duration, segments, words.
     *
     * @throws \RuntimeException When the upload cannot be stored or reopened.
     */
    public function transcribe(UploadedFile $audioFile, ?string $language = null): array
    {
        return $this->withStoredUpload($audioFile, function ($handle) use ($language): array {
            $payload = [
                'model' => config('services.openai.whisper_model'),
                'file' => $handle,
                'response_format' => 'verbose_json',
                'timestamp_granularities' => ['word', 'segment'],
            ];

            // Only send a language hint when the caller supplied one, rather
            // than submitting an explicit null field.
            if ($language !== null) {
                $payload['language'] = $language;
            }

            $response = OpenAI::audio()->transcribe($payload);

            return [
                'success' => true,
                'text' => $response->text,
                'language' => $response->language,
                'duration' => $response->duration,
                'segments' => $response->segments ?? [],
                'words' => $response->words ?? [],
            ];
        });
    }

    /**
     * Download audio from a URL and transcribe it.
     *
     * NOTE(review): $audioUrl is fetched server-side exactly as given. If it can
     * come from untrusted input, the caller must restrict it to trusted hosts
     * and schemes before calling this method.
     *
     * @param string $audioUrl Location of the audio to download.
     *
     * @return array Keys: success, text.
     *
     * @throws \RuntimeException When the audio cannot be downloaded or written to a temp file.
     */
    public function transcribeFromUrl(string $audioUrl): array
    {
        $content = file_get_contents($audioUrl);
        if ($content === false) {
            throw new \RuntimeException("Unable to download audio from {$audioUrl}");
        }

        // tempnam() creates the reserved file itself; the ".mp3" sibling is the
        // one actually uploaded, so both have to be cleaned up.
        $reservedPath = tempnam(sys_get_temp_dir(), 'audio_');
        if ($reservedPath === false) {
            throw new \RuntimeException('Unable to create a temporary file for the downloaded audio.');
        }
        $tempPath = $reservedPath . '.mp3';

        $handle = null;

        try {
            if (file_put_contents($tempPath, $content) === false) {
                throw new \RuntimeException('Unable to write the downloaded audio to a temporary file.');
            }

            $handle = $this->openForReading($tempPath);

            $response = OpenAI::audio()->transcribe([
                'model' => config('services.openai.whisper_model'),
                'file' => $handle,
                'response_format' => 'json',
            ]);

            return [
                'success' => true,
                'text' => $response->text,
            ];
        } finally {
            // The HTTP client may already have closed the stream.
            if (is_resource($handle)) {
                fclose($handle);
            }

            foreach ([$tempPath, $reservedPath] as $file) {
                if (is_file($file)) {
                    unlink($file);
                }
            }
        }
    }

    /**
     * Translate an uploaded audio file into English text.
     *
     * @param UploadedFile $audioFile Audio upload in any supported language.
     *
     * @return array Keys: success, text.
     *
     * @throws \RuntimeException When the upload cannot be stored or reopened.
     */
    public function translate(UploadedFile $audioFile): array
    {
        return $this->withStoredUpload($audioFile, function ($handle): array {
            $response = OpenAI::audio()->translate([
                'model' => config('services.openai.whisper_model'),
                'file' => $handle,
                'response_format' => 'json',
            ]);

            return [
                'success' => true,
                'text' => $response->text,
            ];
        });
    }

    /**
     * Store the upload temporarily, hand an open read handle to $request, and
     * always close the handle and delete the temporary copy afterwards.
     *
     * @param callable $request Receives the open file handle and returns the result array.
     */
    private function withStoredUpload(UploadedFile $audioFile, callable $request): array
    {
        $path = $audioFile->store('temp/audio', 'local');
        if ($path === false) {
            throw new \RuntimeException('Unable to store the uploaded audio for transcription.');
        }

        $handle = null;

        try {
            $handle = $this->openForReading(Storage::disk('local')->path($path));

            return $request($handle);
        } finally {
            // The HTTP client may already have closed the stream.
            if (is_resource($handle)) {
                fclose($handle);
            }

            Storage::disk('local')->delete($path);
        }
    }

    /**
     * Open a file for reading, failing loudly instead of passing `false` on.
     *
     * @return resource
     */
    private function openForReading(string $path)
    {
        $handle = fopen($path, 'r');
        if ($handle === false) {
            throw new \RuntimeException("Unable to open audio file for reading: {$path}");
        }

        return $handle;
    }
}

Voice Command Controller

// app/Http/Controllers/VoiceController.php
namespace App\Http\Controllers;

use App\Services\Voice\TranscriptionService;
use App\Services\Voice\CommandProcessor;
use App\Services\Voice\TextToSpeechService;
use Illuminate\Http\Request;
use Illuminate\Http\JsonResponse;

class VoiceController extends Controller
{
    public function __construct(
        private TranscriptionService $transcription,
        private CommandProcessor $commandProcessor,
        private TextToSpeechService $tts,
    ) {}

    /**
     * Transcribe an uploaded audio clip and return the transcription payload as JSON.
     *
     * Expects an `audio` file and an optional two-character `language` hint.
     */
    public function transcribe(Request $request): JsonResponse
    {
        $request->validate([
            'audio' => 'required|file|mimes:mp3,wav,webm,m4a,ogg|max:25000',
            'language' => 'nullable|string|size:2',
        ]);

        $result = $this->transcription->transcribe(
            $request->file('audio'),
            $request->input('language')
        );

        return response()->json($result);
    }

    /**
     * Run a spoken command: transcribe the audio, process the text as a
     * command, and optionally synthesize a spoken reply.
     *
     * Responds with HTTP 500 and an `error` message when transcription fails.
     */
    public function command(Request $request): JsonResponse
    {
        $request->validate([
            'audio' => 'required|file|mimes:mp3,wav,webm,m4a,ogg|max:25000',
        ]);

        // 1. Transcribe audio to text. The service signals failure by throwing,
        //    so catch that here; otherwise the error response below could
        //    never be reached.
        try {
            $transcription = $this->transcription->transcribe($request->file('audio'));
        } catch (\Throwable $e) {
            report($e);
            $transcription = ['success' => false];
        }

        if (!($transcription['success'] ?? false)) {
            return response()->json(['error' => 'Transcription failed'], 500);
        }

        // 2. Process command
        $commandResult = $this->commandProcessor->process($transcription['text']);

        // 3. Generate voice response (only when the client asks for it)
        $audioResponse = null;
        if ($request->boolean('voice_response')) {
            $audioResponse = $this->tts->generate($commandResult['response']);
        }

        return response()->json([
            'transcription' => $transcription['text'],
            'command' => $commandResult['command'],
            'response' => $commandResult['response'],
            'audio_response' => $audioResponse,
        ]);
    }
}

Command Processing

Natural Language Command Parser

// app/Services/Voice/CommandProcessor.php
namespace App\Services\Voice;

use OpenAI\Laravel\Facades\OpenAI;
use App\Services\Voice\Commands\CommandInterface;
use Illuminate\Support\Facades\Log;

class CommandProcessor
{
    /** @var array<string, CommandInterface> Registered handlers keyed by command name. */
    private array $commands = [];

    public function __construct()
    {
        $this->registerCommands();
    }

    /**
     * Parse the spoken input into an intent and run the matching command.
     *
     * @param string $input Transcribed user utterance.
     *
     * @return array Always contains `command` and `response`; known commands
     *               additionally include `parameters` and `result`.
     */
    public function process(string $input): array
    {
        // Parse intent with GPT
        $intent = $this->parseIntent($input);

        Log::info('Voice command processed', [
            'input' => $input,
            'intent' => $intent,
        ]);

        // The model's JSON is not guaranteed to carry both keys (or the right
        // types), so normalise before dereferencing.
        $commandName = is_string($intent['command'] ?? null) ? $intent['command'] : 'unknown';
        $parameters = is_array($intent['parameters'] ?? null) ? $intent['parameters'] : [];

        // Execute command
        if (isset($this->commands[$commandName])) {
            $result = $this->commands[$commandName]->execute($parameters);

            return [
                'command' => $commandName,
                'parameters' => $parameters,
                'result' => $result,
                'response' => $this->generateResponse($commandName, $result),
            ];
        }

        return [
            'command' => 'unknown',
            'response' => $this->handleUnknownCommand($input),
        ];
    }

    /**
     * Ask the chat model to classify the input as one of the registered commands.
     *
     * @return array Decoded intent; falls back to an "unknown" intent when the
     *               model's reply is not a JSON object.
     */
    protected function parseIntent(string $input): array
    {
        $response = OpenAI::chat()->create([
            'model' => 'gpt-4o',
            'messages' => [
                [
                    'role' => 'system',
                    'content' => $this->getIntentParsingPrompt()
                ],
                [
                    'role' => 'user',
                    'content' => $input
                ]
            ],
            'response_format' => ['type' => 'json_object'],
            'temperature' => 0.3,
        ]);

        $decoded = json_decode($response->choices[0]->message->content ?? '', true);

        // json_decode() yields null (or a scalar) for malformed or empty
        // content, which would violate the declared array return type.
        return is_array($decoded)
            ? $decoded
            : ['command' => 'unknown', 'parameters' => []];
    }

    /**
     * Build the system prompt that lists the registered commands and the
     * JSON shape the model must answer with.
     */
    protected function getIntentParsingPrompt(): string
    {
        $commandList = $this->describeCommands();

        return <<<PROMPT
You are a voice command parser. Parse the user's voice input and extract:
1. The command they want to execute
2. Any parameters for that command

Available commands:
{$commandList}

Respond in JSON format:
{
    "command": "command_name",
    "parameters": {
        "param1": "value1"
    },
    "confidence": 0.95
}

If the command is unclear, use "unknown" as the command.
PROMPT;
    }

    /**
     * Register the command handlers, keyed by the name the model must return.
     */
    protected function registerCommands(): void
    {
        $this->commands = [
            'search' => new Commands\SearchCommand(),
            'create_note' => new Commands\CreateNoteCommand(),
            'send_email' => new Commands\SendEmailCommand(),
            'show_dashboard' => new Commands\ShowDashboardCommand(),
            'navigate' => new Commands\NavigateCommand(),
            'play_music' => new Commands\PlayMusicCommand(),
            'set_reminder' => new Commands\SetReminderCommand(),
        ];
    }

    /**
     * Turn a command result into a short sentence for the user.
     *
     * Falls back to a generic message when the command has no template or the
     * result is not an array of placeholder values.
     */
    protected function generateResponse(string $command, mixed $result): string
    {
        $templates = [
            'search' => "I found {count} results for \"{query}\"",
            'create_note' => "Created a note with title \"{title}\"",
            'send_email' => "Email sent to {recipient}",
            'navigate' => "Navigating to {page}",
            'set_reminder' => "Reminder set for {time}",
        ];

        if (isset($templates[$command]) && is_array($result)) {
            return $this->interpolate($templates[$command], $result);
        }

        return "Executed command {$command}";
    }

    /**
     * Ask the chat model for a friendly reply to an unrecognised request.
     */
    protected function handleUnknownCommand(string $input): string
    {
        // The model can only explain the available commands if it is told
        // what they are, so the registered list is appended to the prompt.
        $systemPrompt = 'You are a helpful assistant. The user said something that is not a recognized command. Politely explain what commands are available and ask them to try again.'
            . "\n\nAvailable commands:\n" . $this->describeCommands();

        $response = OpenAI::chat()->create([
            'model' => 'gpt-4o',
            'messages' => [
                [
                    'role' => 'system',
                    'content' => $systemPrompt
                ],
                [
                    'role' => 'user',
                    'content' => $input
                ]
            ],
        ]);

        return $response->choices[0]->message->content ?? '';
    }

    /**
     * Render the registered commands as a "- name: description" bullet list.
     */
    private function describeCommands(): string
    {
        return implode("\n", array_map(
            fn($cmd, $handler) => "- {$cmd}: {$handler->getDescription()}",
            array_keys($this->commands),
            $this->commands
        ));
    }

    /**
     * Replace each "{key}" placeholder in the template with the matching value.
     *
     * Arrays and non-stringable objects (for example a nested result list) are
     * skipped: str_replace() throws a TypeError when given an array replacement
     * for a string search.
     */
    private function interpolate(string $template, array $data): string
    {
        foreach ($data as $key => $value) {
            if (is_array($value) || (is_object($value) && !$value instanceof \Stringable)) {
                continue;
            }

            $template = str_replace("{{$key}}", (string) $value, $template);
        }

        return $template;
    }
}

Example Command Implementation

// app/Services/Voice/Commands/SearchCommand.php
namespace App\Services\Voice\Commands;

use App\Models\Post;

class SearchCommand implements CommandInterface
{
    /**
     * Human-readable summary of this command and the parameters it accepts.
     */
    public function getDescription(): string
    {
        return "Search for content. Parameters: query (string)";
    }

    /**
     * Look up posts matching the `query` parameter.
     *
     * @param array $parameters Command parameters; a missing `query` is treated as an empty string.
     *
     * @return array Keys: query, count, results (each entry has id, title, url). At most 10 posts are fetched.
     */
    public function execute(array $parameters): array
    {
        $term = $parameters['query'] ?? '';

        $matches = Post::search($term)->take(10)->get();

        $summaries = $matches
            ->map(function ($post) {
                return [
                    'id' => $post->id,
                    'title' => $post->title,
                    'url' => route('posts.show', $post),
                ];
            })
            ->toArray();

        return [
            'query' => $term,
            'count' => $matches->count(),
            'results' => $summaries,
        ];
    }
}
// app/Services/Voice/Commands/CreateNoteCommand.php
namespace App\Services\Voice\Commands;

use App\Models\Note;
use Illuminate\Support\Facades\Auth;

class CreateNoteCommand implements CommandInterface
{
    /**
     * Human-readable summary of this command and the parameters it accepts.
     */
    public function getDescription(): string
    {
        return "Create a new note. Parameters: title, content";
    }

    /**
     * Create a note owned by the currently authenticated user.
     *
     * @param array $parameters Command parameters; `title` falls back to
     *                          "Untitled Note" and `content` to an empty string.
     *
     * @return array Keys: id, title (taken from the created note).
     */
    public function execute(array $parameters): array
    {
        $attributes = [
            'user_id' => Auth::id(),
            'title' => $parameters['title'] ?? 'Untitled Note',
            'content' => $parameters['content'] ?? '',
        ];

        $created = Note::create($attributes);

        return ['id' => $created->id, 'title' => $created->title];
    }
}

Text-to-Speech

TTS Service

// app/Services/Voice/TextToSpeechService.php
namespace App\Services\Voice;

use OpenAI\Laravel\Facades\OpenAI;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Str;

class TextToSpeechService
{
    /**
     * Synthesize speech for the given text and store it as an MP3 on the public disk.
     *
     * @param string      $text  Text to speak.
     * @param string|null $voice Voice name; defaults to config('services.openai.tts_voice').
     * @param string|null $model TTS model; defaults to config('services.openai.tts_model').
     *
     * @return array Keys: url, path, text, voice.
     *
     * @throws \RuntimeException When the generated audio cannot be written to storage.
     */
    public function generate(
        string $text,
        ?string $voice = null,
        ?string $model = null
    ): array {
        $voice = $voice ?? config('services.openai.tts_voice', 'alloy');
        $model = $model ?? config('services.openai.tts_model', 'tts-1');

        $response = OpenAI::audio()->speech([
            'model' => $model,
            'voice' => $voice,
            'input' => $text,
            'response_format' => 'mp3',
        ]);

        // Save audio file under a unique name so concurrent requests never collide.
        $filename = 'tts/' . Str::uuid() . '.mp3';
        if (Storage::disk('public')->put($filename, $response) === false) {
            throw new \RuntimeException("Unable to store generated speech at {$filename}");
        }

        return [
            'url' => Storage::disk('public')->url($filename),
            'path' => $filename,
            'text' => $text,
            'voice' => $voice,
        ];
    }

    /**
     * Stream synthesized speech chunk by chunk instead of buffering the whole file.
     *
     * Uses the configured TTS model (falling back to 'tts-1') so streaming and
     * generate() stay consistent.
     *
     * @param string $text  Text to speak.
     * @param string $voice Voice name.
     *
     * @return \Generator Yields each chunk produced by the streamed response.
     */
    public function stream(string $text, string $voice = 'alloy'): \Generator
    {
        $response = OpenAI::audio()->speechStreamed([
            'model' => config('services.openai.tts_model', 'tts-1'),
            'voice' => $voice,
            'input' => $text,
        ]);

        foreach ($response as $chunk) {
            yield $chunk;
        }
    }

    /**
     * List the voice names this service advertises, with a short description of each.
     *
     * @return array<string, string> Voice name => description.
     */
    public function getAvailableVoices(): array
    {
        return [
            'alloy' => 'Neutral, balanced voice',
            'echo' => 'Warm, engaging voice',
            'fable' => 'British accent, narrative style',
            'onyx' => 'Deep, authoritative voice',
            'nova' => 'Friendly, conversational voice',
            'shimmer' => 'Clear, expressive voice',
        ];
    }
}

Streaming TTS Controller

// app/Http/Controllers/TTSController.php
namespace App\Http\Controllers;

use App\Services\Voice\TextToSpeechService;
use Illuminate\Http\Request;
use Symfony\Component\HttpFoundation\StreamedResponse;

class TTSController extends Controller
{
    public function __construct(
        private TextToSpeechService $tts
    ) {}

    /**
     * Generate a stored speech file for the submitted text and return its metadata as JSON.
     *
     * Expects `text` (max 4096 characters) and an optional `voice` from the allowed list.
     */
    public function generate(Request $request)
    {
        $request->validate([
            'text' => 'required|string|max:4096',
            'voice' => 'nullable|string|in:alloy,echo,fable,onyx,nova,shimmer',
        ]);

        $result = $this->tts->generate(
            $request->input('text'),
            $request->input('voice', 'alloy')
        );

        return response()->json($result);
    }

    /**
     * Stream synthesized speech for the submitted text as an audio/mpeg response.
     *
     * Expects `text` (max 4096 characters) and an optional `voice`.
     */
    public function stream(Request $request): StreamedResponse
    {
        $request->validate([
            'text' => 'required|string|max:4096',
            'voice' => 'nullable|string',
        ]);

        $text = $request->input('text');

        // `voice` is nullable, so it can be present but null (e.g. an empty
        // form field); the input() default only covers a missing key, and the
        // service's $voice parameter is a non-nullable string.
        $voice = $request->input('voice') ?? 'alloy';

        return response()->stream(function () use ($text, $voice) {
            foreach ($this->tts->stream($text, $voice) as $chunk) {
                echo $chunk;

                // ob_flush() raises a notice when no output buffer is active.
                if (ob_get_level() > 0) {
                    ob_flush();
                }
                flush();
            }
        }, 200, [
            'Content-Type' => 'audio/mpeg',
            'Cache-Control' => 'no-cache',
        ]);
    }
}

Frontend Integration

Voice Recorder Component

// resources/js/components/VoiceRecorder.js
import RecordRTC from 'recordrtc';

/**
 * Records microphone audio with RecordRTC and uploads it to the voice API.
 *
 * Supported options (all optional callbacks):
 *   onRecordingStart() - recording has begun
 *   onRecordingStop()  - recording has ended
 *   onTranscription(r) - server response for the uploaded recording
 *   onError(error)     - microphone, recording, or upload failure
 */
export default class VoiceRecorder {
    constructor(options = {}) {
        this.options = {
            onTranscription: () => {},
            onError: () => {},
            onRecordingStart: () => {},
            onRecordingStop: () => {},
            ...options
        };

        this.recorder = null;
        this.stream = null;
        this.isRecording = false;
        // True while start() is waiting for microphone access.
        this.isStarting = false;
        // Set when stop() is called before start() has finished.
        this.cancelRequested = false;
    }

    /**
     * Request microphone access and begin recording.
     * Does nothing if a recording is already running or starting.
     * Failures are reported through onError instead of being thrown.
     */
    async start() {
        if (this.isRecording || this.isStarting) {
            return;
        }

        this.isStarting = true;
        this.cancelRequested = false;

        try {
            this.stream = await navigator.mediaDevices.getUserMedia({
                audio: {
                    echoCancellation: true,
                    noiseSuppression: true,
                    sampleRate: 16000,
                }
            });

            // The user let go while the permission prompt was still open:
            // give the microphone back instead of recording indefinitely.
            if (this.cancelRequested) {
                this.releaseStream();
                return;
            }

            this.recorder = new RecordRTC(this.stream, {
                type: 'audio',
                mimeType: 'audio/webm',
                recorderType: RecordRTC.StereoAudioRecorder,
                numberOfAudioChannels: 1,
                desiredSampRate: 16000,
            });

            this.recorder.startRecording();
            this.isRecording = true;
            this.options.onRecordingStart();

        } catch (error) {
            this.isRecording = false;
            this.releaseStream();
            this.options.onError(error);
        } finally {
            this.isStarting = false;
        }
    }

    /**
     * Stop recording, upload the audio for transcription, and resolve with
     * the server response.
     *
     * Resolves with null when there is nothing to stop. Rejects (after
     * calling onError) when finishing the recording or the upload fails.
     */
    async stop() {
        if (this.isStarting) {
            this.cancelRequested = true;
            return null;
        }

        if (!this.recorder || !this.isRecording) {
            return null;
        }

        const recorder = this.recorder;

        return new Promise((resolve, reject) => {
            recorder.stopRecording(async () => {
                try {
                    const blob = recorder.getBlob();
                    this.isRecording = false;
                    this.options.onRecordingStop();

                    // Stop all tracks so the browser releases the microphone.
                    this.releaseStream();

                    // Send to server
                    const result = await this.transcribe(blob);
                    this.options.onTranscription(result);

                    resolve(result);
                } catch (error) {
                    // Without this the returned promise would never settle.
                    this.options.onError(error);
                    reject(error);
                }
            });
        });
    }

    /**
     * Upload a recording to the transcription endpoint.
     * @param {Blob} blob Recorded audio.
     * @returns {Promise<object>} Parsed JSON response.
     */
    async transcribe(blob) {
        const formData = new FormData();
        formData.append('audio', blob, 'recording.webm');

        return this.postAudio('/api/voice/transcribe', formData);
    }

    /**
     * Upload a recording to the command endpoint and request a spoken reply.
     * @param {Blob} blob Recorded audio.
     * @returns {Promise<object>} Parsed JSON response.
     */
    async sendCommand(blob) {
        const formData = new FormData();
        formData.append('audio', blob, 'command.webm');
        formData.append('voice_response', 'true');

        return this.postAudio('/api/voice/command', formData);
    }

    /**
     * POST form data with the page's CSRF token (when present) and parse the
     * JSON reply. Throws on non-2xx responses so callers do not mistake an
     * error payload for a result.
     */
    async postAudio(url, formData) {
        const token = document.querySelector('meta[name="csrf-token"]')?.content;

        const response = await fetch(url, {
            method: 'POST',
            headers: token ? { 'X-CSRF-TOKEN': token } : {},
            body: formData
        });

        if (!response.ok) {
            throw new Error(`Voice request to ${url} failed with status ${response.status}`);
        }

        return response.json();
    }

    /** Stop every track of the active media stream and forget it. */
    releaseStream() {
        if (this.stream) {
            this.stream.getTracks().forEach(track => track.stop());
            this.stream = null;
        }
    }
}

Voice Command Button Component

<!-- resources/js/components/VoiceCommandButton.vue -->
<template>
    <div class="voice-command">
        <!--
            Push-to-talk button: hold to record, release to send.
            - type="button" keeps it from submitting a surrounding form.
            - .prevent on the touch handlers suppresses the synthesized mouse
              events, which would otherwise start/stop the recording twice.
            - mouseleave/touchcancel end a running recording when the pointer
              leaves the button before it is released.
        -->
        <button 
            type="button"
            @mousedown="startRecording"
            @mouseup="stopRecording"
            @mouseleave="isRecording && stopRecording()"
            @touchstart.prevent="startRecording"
            @touchend.prevent="stopRecording"
            @touchcancel="isRecording && stopRecording()"
            :class="{ 'recording': isRecording }"
            class="voice-btn"
        >
            <svg v-if="!isRecording" class="w-8 h-8" fill="currentColor" viewBox="0 0 24 24">
                <path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3z"/>
                <path d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z"/>
            </svg>
            <span v-else class="recording-indicator">
                <span class="pulse"></span>
            </span>
        </button>
        
        <!-- What the server heard -->
        <div v-if="transcription" class="transcription-result">
            <p class="text-sm text-gray-600">{{ transcription }}</p>
        </div>
        
        <!-- Text reply, plus the spoken reply when the server returned one -->
        <div v-if="response" class="response">
            <p>{{ response }}</p>
            <audio v-if="audioUrl" :src="audioUrl" autoplay></audio>
        </div>
    </div>
</template>

<script>
import VoiceRecorder from './VoiceRecorder';

/**
 * Push-to-talk button: records while held, then shows the transcription and
 * any text/audio reply returned by the server.
 */
export default {
    data() {
        return {
            isRecording: false,
            transcription: null,
            response: null,
            audioUrl: null,
            recorder: null,
        };
    },
    
    mounted() {
        this.recorder = new VoiceRecorder({
            onRecordingStart: () => {
                this.isRecording = true;
                // Clear everything from the previous exchange, including the
                // old audio reply so it is not replayed next to a new answer.
                this.transcription = null;
                this.response = null;
                this.audioUrl = null;
            },
            onRecordingStop: () => {
                this.isRecording = false;
            },
            onTranscription: (result) => {
                // The command endpoint answers with `transcription`/`response`,
                // the plain transcription endpoint with `text`; accept both.
                this.transcription = result?.transcription ?? result?.text ?? null;
                this.response = result?.response ?? null;
                this.audioUrl = result?.audio_response?.url ?? null;
            },
            onError: (error) => {
                console.error('Recording error:', error);
                alert('Could not access microphone');
            }
        });
    },
    
    methods: {
        /** Begin recording; failures are reported through the recorder's onError. */
        startRecording() {
            this.recorder.start();
        },
        
        /** Finish recording and wait for the server reply. */
        async stopRecording() {
            try {
                await this.recorder.stop();
            } catch (error) {
                // Keep a failed stop/upload from surfacing as an unhandled rejection.
                console.error('Failed to process recording:', error);
            }
        }
    }
};
</script>

<style scoped>
/* Round push-to-talk button in its idle state. */
.voice-btn {
    @apply w-16 h-16 rounded-full bg-blue-500 text-white flex items-center justify-center;
    @apply hover:bg-blue-600 transition-all;
}

/* Applied while `isRecording` is true: switch to red and pulse. */
.voice-btn.recording {
    @apply bg-red-500 animate-pulse;
}

/* Positioning box for the dot shown in place of the microphone icon. */
.recording-indicator {
    @apply relative w-4 h-4;
}

/* White dot that fills the indicator and runs the `pulse` keyframes below. */
.pulse {
    @apply absolute w-full h-full bg-white rounded-full;
    animation: pulse 1s infinite;
}

/* Grow to 1.5x and fade to half opacity at the midpoint, then return. */
@keyframes pulse {
    0% { transform: scale(1); opacity: 1; }
    50% { transform: scale(1.5); opacity: 0.5; }
    100% { transform: scale(1); opacity: 1; }
}
</style>

Realtime Voice Chat

WebSocket Integration

// app/Events/VoiceMessage.php
namespace App\Events;

use Illuminate\Broadcasting\Channel;
use Illuminate\Broadcasting\InteractsWithSockets;
use Illuminate\Contracts\Broadcasting\ShouldBroadcast;

class VoiceMessage implements ShouldBroadcast
{
    use InteractsWithSockets;

    /**
     * Broadcast payload for one voice message sent to a room.
     *
     * @param string $roomId        Room the message belongs to.
     * @param string $userId        Sender identifier.
     * @param string $audioUrl      URL of the stored audio clip.
     * @param string $transcription Text transcription of the clip.
     */
    public function __construct(
        public string $roomId,
        public string $userId,
        public string $audioUrl,
        public string $transcription,
    ) {}

    /**
     * Channel the event is broadcast on: "voice-room.<roomId>".
     *
     * NOTE(review): this uses the plain Channel class; confirm that rooms are
     * meant to be subscribable without channel authorization.
     */
    public function broadcastOn(): Channel
    {
        $channelName = "voice-room.{$this->roomId}";

        return new Channel($channelName);
    }
}
// app/Http/Controllers/VoiceChatController.php
namespace App\Http\Controllers;

use App\Events\VoiceMessage;
use App\Services\Voice\TranscriptionService;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Storage;

class VoiceChatController extends Controller
{
    public function __construct(
        private TranscriptionService $transcription
    ) {}

    /**
     * Store an uploaded voice clip, transcribe it, and broadcast it to the room.
     *
     * Responds with 401 when no user is authenticated. If transcription fails,
     * the stored clip is removed again and the exception is rethrown.
     *
     * NOTE(review): $roomId is used verbatim in the storage path and no room
     * membership check is visible here; confirm the route constrains it.
     *
     * @param string $roomId Room identifier taken from the route.
     */
    public function sendMessage(Request $request, string $roomId)
    {
        $request->validate([
            'audio' => 'required|file|mimes:webm,mp3,wav|max:10000',
        ]);

        // VoiceMessage::$userId is a non-nullable string, so an anonymous
        // request must be rejected before anything is stored or broadcast.
        $userId = auth()->id();
        abort_if($userId === null, 401);

        $upload = $request->file('audio');

        // Save audio
        $path = $upload->store("voice-messages/{$roomId}", 'public');
        $audioUrl = Storage::disk('public')->url($path);

        // Transcribe
        try {
            $result = $this->transcription->transcribe($upload);
        } catch (\Throwable $e) {
            // Don't leave a clip behind that was never delivered to the room.
            Storage::disk('public')->delete($path);

            throw $e;
        }

        // Broadcast to room
        event(new VoiceMessage(
            roomId: $roomId,
            userId: (string) $userId,
            audioUrl: $audioUrl,
            transcription: $result['text'],
        ));

        return response()->json([
            'success' => true,
            'audio_url' => $audioUrl,
            'transcription' => $result['text'],
        ]);
    }
}

Conclusion

Voice interfaces open up many new possibilities for Laravel applications:

  • Accessibility: Help users with disabilities use apps more easily
  • Convenience: Faster interaction than typing
  • Natural: Communicate as naturally as you would with another person
  • Multitasking: Use when hands are busy

Best Practices

  1. Clear feedback: Let users know when recording is in progress
  2. Error handling: Handle cases where the audio is unclear
  3. Privacy: Notify users that their audio is being recorded
  4. Fallback: Always offer a text input option

References

Comments