// --- Deepgram SpeechToText API call with ESP32 & SD Card [Arduino IDE, no additional library needed]
// --- Workflow: Reading AUDIO .wav file on SD(SPI) Card, sending to Deepgram Server, receiving Transcription response
// --- This workflow repeats endlessly in loop()
#include <WiFi.h> // all 3 libraries are part of Arduino/ESP32 library (no install needed)
#include <WiFiClientSecure.h>
#include <SD.h>
// ---------- Sketch configuration and shared state ----------
// WiFi credentials: use the Wokwi virtual WiFi access point (no password needed)
const char* ssid = "Wokwi-GUEST";
const char* password = "";
// Deepgram API key, sent in the "Authorization: Token ..." request header by loop()
const char* deepgramApiKey = "..."; // placeholder - add your own credentials here!
// Path of the WAV file on the SD card that is uploaded for transcription
const char* audio_filename = "/AudioGerman.wav"; // audio file on the SD card to transcribe
// TLS client shared by setup() (initial connect to api.deepgram.com) and loop() (requests)
WiFiClientSecure client;
// ----------------------------------------------------------------------------------------------------------------------------
// One-time initialisation: serial console, WiFi, SD card and the HTTPS
// connection to Deepgram. Any failure after WiFi is up is reported on the
// serial console and aborts the remaining initialisation steps.
void setup()
{
  Serial.begin(115200);

  // Join the WiFi network; report progress once per second until associated.
  WiFi.begin(ssid, password);
  for (;;)
  {
    if (WiFi.status() == WL_CONNECTED)
    {
      break;
    }
    delay(1000);
    Serial.println("Connecting to WiFi...");
  }
  Serial.println("Connected to WiFi");

  // Mount the SD card on the default VSPI pins (5, 18, 19, 23).
  // With a dedicated chip-select pin use SD.begin(SD_CS_PIN) instead.
  if (!SD.begin())
  {
    Serial.println("ERROR - Card Mount Failed");
    return;
  }
  if (SD.cardType() == CARD_NONE)
  {
    Serial.println("ERROR - No SD card attached");
    return;
  }
  Serial.println("SD card initialized.");

  // Connect to Deepgram once and stay connected: loop() sends repeated audio
  // requests, so reusing the open connection gives the best performance.
  client.setInsecure();  // encrypted HTTPS connection without certificate validation
  const bool serverReached = client.connect("api.deepgram.com", 443);
  if (!serverReached)
  {
    Serial.println("ERROR - Connection failed");
    return;
  }
  Serial.println("> Connected to Deepgram Server." );
}
// ----------------------------------------------------------------------------------------------------------------------------
void loop()
{
long start_time = millis();
File audioFile = SD.open( audio_filename );
if (!audioFile) {
Serial.println("ERROR - Failed to open file for reading");
return;
}
size_t audio_size = audioFile.size();
audioFile.close();
Serial.println("\n> Audio File [" + (String) audio_filename + "] found, size: " + (String) audio_size );
// ---------- Send HTTPS request header
/*client.println("POST /v1/listen HTTP/1.1");*/
client.println("POST /v1/listen?model=nova-2-general&detect_language=true HTTP/1.1");
client.println("Host: api.deepgram.com");
client.println("Authorization: Token " + String(deepgramApiKey));
client.println("Content-Type: audio/wav");
client.println("Content-Length: " + String(audio_size));
client.println();
Serial.println("> POST Request to Deepgram Server started, sending WAV data now ..." );
// ---------- Reading the AUDIO wav file, sending in CHUNKS (closing file after done)
File file = SD.open( audio_filename, FILE_READ );
const size_t bufferSize = 1024; // we use a 1KB buffer
uint8_t buffer[bufferSize];
size_t bytesRead;
while (file.available())
{ bytesRead = file.read(buffer, sizeof(buffer));
if (bytesRead > 0) client.write(buffer, bytesRead); // sending WAV AUDIO data
}
file.close();
Serial.println("> All bytes sent, waiting Deepgram transcription ... \n-----------------" );
// ---------- Receiving Deepgram Server response
String response = "";
while ( response == "" )
{ while (client.available())
{ char c = client.read();
response += String(c);
}
}
// ---------- Printing complete Deepgram RESPONSE)
Serial.println("Response: " + response );
// ---------- closing connection to Deepgram
// we keep open for best performance on next request in loop()
/* client.stop(); // end connection */
// ---------- Addon (optionally):
// Extracting and printing summary: Transcription + Detected language + Total duration [sec],
// using String operations only (no library needed), alternatively: using json.h libraries
int pos_start, pos_end;
String transcription, language;
String json_Transcript_Start = "\"transcript\":";
String json_Transcript_End = "\"confidence\":";
String json_DetectLang_Start = "\"detected_language\":";
String json_DetectLang_End = "\"language_confidence\":";
pos_start = response.indexOf(json_Transcript_Start);
if (pos_start > 0)
{ pos_start += json_Transcript_Start.length()+1;
pos_end = response.indexOf(json_Transcript_End, pos_start);
} transcription = response.substring(pos_start, pos_end-2);
pos_start = response.indexOf(json_DetectLang_Start);
if (pos_start > 0)
{ pos_start += json_DetectLang_Start.length()+1;
pos_end = response.indexOf(json_DetectLang_End, pos_start);
} language = response.substring(pos_start, pos_end-2);
Serial.println("=> Transcription: [" + transcription + "]" );
Serial.println("=> Detected Language: [" + language + "]" );
Serial.print( "=> Total Duration (from local SD.open(file) to Transcription complete [sec]: ");
Serial.println( float ((millis()-start_time))/1000, 2 ); // print 2 decimal digits
Serial.println("-----------------\n");
}