WebRTC recording delay on Windows

23 views Asked by At

Recording through WebRTC's audio_device_core_win introduces an audio delay (2 seconds or longer) when the computer is under heavy load and running sluggishly.

Microsoft's official capture sample still shows the same delay:

https://learn.microsoft.com/zh-TW/windows/win32/coreaudio/capturing-a-stream

#pragma once
#include <windows.h>
#include "WavWapper.h"
#include <wmcodecdsp.h> 
#include <audioclient.h>  // WASAPI
#include <audiopolicy.h>
#include <avrt.h>  // Avrt
#include <endpointvolume.h>
#include <mediaobj.h>     // IMediaObject
#include <mmdeviceapi.h>  // MMDevice
#include <string>

/// Captures audio from the default capture endpoint on a background thread and,
/// while recording is enabled, hands the captured data to a WavWapper.
///
/// The object owns a thread handle and a raw WavWapper pointer, so copying it
/// would leave two objects sharing (and double-releasing) the same resources;
/// copy construction and copy assignment are therefore disabled.
class AudioCapture
{
public:
    AudioCapture();
    ~AudioCapture();

    AudioCapture(const AudioCapture&) = delete;
    AudioCapture& operator=(const AudioCapture&) = delete;

public:
    /// Starts the capture thread; does nothing if capture is already running.
    void startCaputre();
    /// Asks the capture thread to stop; does nothing if capture is not running.
    void stopCaputre();

    /// Enables writing of captured data to a file.
    void startRecord();
    /// Disables writing of captured data to a file.
    void stopRecored();

    /// Sets the directory under which numbered "<n>.pcm" files are created.
    void setRecordFile(const std::string &path);

private:
    /// Thread entry point; `context` is the AudioCapture instance.
    static DWORD WINAPI WSAPICaptureThread(LPVOID context);
    /// Body of the capture thread.
    DWORD DoCaptureThread();
    /// Copies the friendly name of `pDevice` (or a placeholder) into `pszBuffer`.
    int32_t _GetDeviceName(IMMDevice* pDevice, LPWSTR pszBuffer, int bufferLen);

private:
    // NOTE(review): these flags are written by the caller's thread and read by
    // the capture thread; `volatile` is not a portable synchronization
    // primitive - consider std::atomic<bool>.
    volatile bool   m_bRuning;        // true while the capture loop should keep running
    volatile bool   m_bRecord;        // true while captured data should be written to file
    HANDLE          m_hRecThread;     // handle of the capture thread
    WavWapper*      m_pWavWapper;     // current file writer; created and destroyed by the capture thread
    std::string     m_strRecordPath;  // directory for the recorded files

    uint16_t _recChannelsPrioList[3]; // channel counts to try, in priority order
};

#include "AudioCapture.h"
#include <functiondiscoverykeys_devpkey.h>
#include <math.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#include <vector>

// Number of REFERENCE_TIME ticks per second and per millisecond
// (10,000,000 ticks per second, i.e. one tick is 100 ns).
#define REFTIMES_PER_SEC  10000000
#define REFTIMES_PER_MILLISEC  10000

// Maximum device-name length; not referenced by the code visible in this file.
static const int kAdmMaxDeviceNameSize = 128;

// Jumps to the enclosing function's `Exit:` label when `hres` is a failure
// HRESULT. Expands to a bare `if` statement, so it must not be used as the
// body of an unbraced if/else.
#define EXIT_ON_ERROR(hres)  \
              if (FAILED(hres)) { goto Exit; }
// Releases a COM interface pointer if it is non-NULL and resets it to NULL.
// Also expands to a bare `if` statement (same caveat as above).
#define SAFE_RELEASE(punk)  \
              if ((punk) != NULL)  \
                { (punk)->Release(); (punk) = NULL; }

// Class and interface identifiers used when creating the device enumerator
// and when requesting the audio client / capture client interfaces.
const CLSID CLSID_MMDeviceEnumerator = __uuidof(MMDeviceEnumerator);
const IID IID_IMMDeviceEnumerator = __uuidof(IMMDeviceEnumerator);
const IID IID_IAudioClient = __uuidof(IAudioClient);
const IID IID_IAudioCaptureClient = __uuidof(IAudioCaptureClient);

/// Converts a null-terminated wide string to a narrow string in the OEM code page.
///
/// @param pWCStrKey  Null-terminated wide string; may be NULL.
/// @return The converted string, or an empty string when the input is NULL,
///         empty, or the conversion fails.
std::string wideCharToMultiByte(wchar_t* pWCStrKey)
{
    if (pWCStrKey == NULL)
    {
        return std::string();
    }

    const int wideLen = static_cast<int>(wcslen(pWCStrKey));
    if (wideLen == 0)
    {
        return std::string();
    }

    // First call: query how many bytes the converted string needs.
    const int byteLen = WideCharToMultiByte(CP_OEMCP, 0, pWCStrKey, wideLen, NULL, 0, NULL, NULL);
    if (byteLen <= 0)
    {
        return std::string();
    }

    // Second call: convert straight into the std::string's own storage, so no
    // manually managed buffer is needed (the previous version leaked its
    // new[] buffer on every call).
    std::string converted(static_cast<size_t>(byteLen), '\0');
    const int written = WideCharToMultiByte(CP_OEMCP, 0, pWCStrKey, wideLen, &converted[0], byteLen, NULL, NULL);
    if (written <= 0)
    {
        return std::string();
    }

    converted.resize(static_cast<size_t>(written));
    return converted;
}

/// Creates an idle capture object: not capturing, not recording, no thread and
/// no file writer. Also sets the channel-count priority list used when
/// negotiating the capture format.
AudioCapture::AudioCapture()
    : m_bRuning(false)
    , m_bRecord(false)
    , m_hRecThread(NULL)   // previously left uninitialized
    , m_pWavWapper(nullptr)
{
    _recChannelsPrioList[0] = 2;  // stereo is prio 1
    _recChannelsPrioList[1] = 1;  // mono is prio 2
    _recChannelsPrioList[2] = 4;  // quad is prio 3
}

/// Turns recording off first, then requests the capture loop to stop.
AudioCapture::~AudioCapture()
{
    this->stopRecored();
    this->stopCaputre();
}

void AudioCapture::startCaputre()
{
    if (m_bRuning)
    {
        return;
    }

    m_bRuning = true;

    LPTHREAD_START_ROUTINE lpStartAddress = WSAPICaptureThread;
    m_hRecThread = CreateThread(NULL, 0, lpStartAddress, this, 0, NULL);
    SetThreadPriority(m_hRecThread, THREAD_PRIORITY_TIME_CRITICAL);
}

void AudioCapture::stopCaputre()
{
    if (!m_bRuning)
    {
        return;
    }

    m_bRuning = false;
}

/// Switches recording on; a no-op when it is already on.
void AudioCapture::startRecord()
{
    if (!m_bRecord)
    {
        m_bRecord = true;
    }
}

/// Switches recording off; a no-op when it is already off.
void AudioCapture::stopRecored()
{
    if (m_bRecord)
    {
        m_bRecord = false;
    }
}

void AudioCapture::setRecordFile(const std::string& path)
{
    m_strRecordPath = path;
}

/// Thread entry point: `context` is the AudioCapture instance passed to
/// CreateThread; the thread's exit code is whatever DoCaptureThread() returns.
DWORD WINAPI AudioCapture::WSAPICaptureThread(LPVOID context)
{
    AudioCapture* const self = static_cast<AudioCapture*>(context);
    return self->DoCaptureThread();
}

/// Body of the capture thread.
///
/// Opens the default capture endpoint in shared mode, negotiates a 16-bit PCM
/// format (trying the sample rates in `freqs` and the channel counts in
/// `_recChannelsPrioList`), then polls the capture client until `m_bRuning`
/// is cleared. While `m_bRecord` is set, every packet is written through a
/// WavWapper to "<m_strRecordPath>/<n>.pcm"; when it is cleared the current
/// file is closed.
///
/// @return The last HRESULT, cast to DWORD (0 on a clean stop).
DWORD AudioCapture::DoCaptureThread()
{
    // Every local that has an initializer is declared here, ahead of the first
    // EXIT_ON_ERROR, so that `goto Exit` never jumps over an initialization.
    HRESULT hr = S_OK;
    bool comInitialized = false;
    bool captureStarted = false;
    REFERENCE_TIME hnsRequestedDuration = REFTIMES_PER_SEC;
    REFERENCE_TIME hnsActualDuration = 0;
    UINT32 bufferFrameCount = 0;
    UINT32 numFramesAvailable = 0;
    IMMDeviceEnumerator* pEnumerator = NULL;
    IMMDevice* pDevice = NULL;
    IAudioClient* pAudioClient = NULL;
    IAudioCaptureClient* pCaptureClient = NULL;
    WAVEFORMATEX* pwfx = NULL;
    WAVEFORMATEX* pWfxClosestMatch = NULL;
    UINT32 packetLength = 0;
    BYTE* pData = NULL;
    DWORD flags = 0;
    DWORD sleepMs = 0;

    int channnels = 0;
    int samplesPerSec = 0;
    int bitPerSample = 0;
    int blockAlign = 0;

    int nFileNumber = 0;

    WCHAR szDeviceName[MAX_PATH];
    const int bufferLen = sizeof(szDeviceName) / sizeof(szDeviceName[0]);
    WAVEFORMATEXTENSIBLE Wfx = WAVEFORMATEXTENSIBLE();
    const int freqs[6] = { 48000, 44100, 16000, 96000, 32000, 8000 };
    std::vector<BYTE> silence;  // zero-filled stand-in for packets flagged as silent

    hr = CoInitializeEx(NULL, COINIT_MULTITHREADED);
    EXIT_ON_ERROR(hr);
    comInitialized = true;  // every successful CoInitializeEx needs a CoUninitialize

    hr = CoCreateInstance(
        CLSID_MMDeviceEnumerator, NULL,
        CLSCTX_ALL, IID_IMMDeviceEnumerator,
        reinterpret_cast<void**>(&pEnumerator));
    EXIT_ON_ERROR(hr);

    hr = pEnumerator->GetDefaultAudioEndpoint(
            eCapture, eMultimedia, &pDevice);

    // _GetDeviceName copes with a NULL device, so the name (or its
    // placeholder) is printed before the result above is checked.
    _GetDeviceName(pDevice, szDeviceName, bufferLen);
    std::cout << "device:\"" << wideCharToMultiByte(szDeviceName) << "\"" << std::endl;

    EXIT_ON_ERROR(hr);

    hr = pDevice->Activate(
            IID_IAudioClient, CLSCTX_ALL,
            NULL, reinterpret_cast<void**>(&pAudioClient));
    EXIT_ON_ERROR(hr);

    // The mix format is only printed for diagnostics; the stream itself is
    // opened with the format negotiated below.
    hr = pAudioClient->GetMixFormat(&pwfx);
    EXIT_ON_ERROR(hr);

    std::cout << "nChannels      : " << pwfx->nChannels << std::endl;
    std::cout << "nSamplesPerSec : " << pwfx->nSamplesPerSec << std::endl;
    std::cout << "nAvgBytesPerSec: " << pwfx->nAvgBytesPerSec << std::endl;
    std::cout << "nBlockAlign    : " << pwfx->nBlockAlign << std::endl;
    std::cout << "wBitsPerSample : " << pwfx->wBitsPerSample << std::endl;
    std::cout << "cbSize         : " << pwfx->cbSize << std::endl;

    // Fixed part of the requested format: 16-bit PCM in extensible layout.
    Wfx.Format.wFormatTag = WAVE_FORMAT_EXTENSIBLE;
    Wfx.Format.wBitsPerSample = 16;
    Wfx.Format.cbSize = 22;
    Wfx.dwChannelMask = 0;
    Wfx.Samples.wValidBitsPerSample = Wfx.Format.wBitsPerSample;
    Wfx.SubFormat = KSDATAFORMAT_SUBTYPE_PCM;

    // Try each sample rate with each channel count until one is accepted.
    hr = S_FALSE;
    for (unsigned int freq = 0; freq < sizeof(freqs) / sizeof(freqs[0]); freq++)
    {
        for (unsigned int chan = 0; chan < sizeof(_recChannelsPrioList) / sizeof(_recChannelsPrioList[0]);
            chan++)
        {
            Wfx.Format.nChannels = _recChannelsPrioList[chan];
            Wfx.Format.nSamplesPerSec = freqs[freq];
            Wfx.Format.nBlockAlign =
                Wfx.Format.nChannels * Wfx.Format.wBitsPerSample / 8;
            Wfx.Format.nAvgBytesPerSec =
                Wfx.Format.nSamplesPerSec * Wfx.Format.nBlockAlign;

            hr = pAudioClient->IsFormatSupported(AUDCLNT_SHAREMODE_SHARED, &Wfx.Format, &pWfxClosestMatch);
            if (hr == S_OK)
            {
                break;
            }

            if (pWfxClosestMatch)
            {
                std::cout << "nChannels=" << Wfx.Format.nChannels
                    << ", nSamplesPerSec=" << Wfx.Format.nSamplesPerSec
                    << " is not supported. Closest match: "
                    "nChannels="
                    << pWfxClosestMatch->nChannels << ", nSamplesPerSec="
                    << pWfxClosestMatch->nSamplesPerSec << std::endl;

                CoTaskMemFree(pWfxClosestMatch);
                pWfxClosestMatch = NULL;
            }
            else
            {
                std::cout << "nChannels=" << Wfx.Format.nChannels
                    << ", nSamplesPerSec=" << Wfx.Format.nSamplesPerSec
                    << " is not supported. No closest match." << std::endl;
            }
        }
        if (hr == S_OK)
        {
            break;
        }
    }

    if (hr == S_OK)
    {
        channnels = Wfx.Format.nChannels;
        samplesPerSec = Wfx.Format.nSamplesPerSec;
        bitPerSample = Wfx.Format.wBitsPerSample;
        blockAlign = Wfx.Format.nBlockAlign;
    }
    std::cout << "-------------------------------" << std::endl;
    std::cout << "channnels:" << channnels << std::endl;
    std::cout << "samplesPerSec:" << samplesPerSec << std::endl;
    std::cout << "bitPerSample:" << bitPerSample << std::endl;
    std::cout << "blockAlign:" << blockAlign << std::endl;

    if (hr != S_OK)
    {
        // No candidate format was accepted: stop here instead of initializing
        // the client with a format (and a zero block size) that was rejected.
        if (SUCCEEDED(hr))
        {
            hr = AUDCLNT_E_UNSUPPORTED_FORMAT;
        }
        goto Exit;
    }

    // Pass the *requested* duration here. The previous code passed
    // hnsActualDuration, which had not been assigned yet.
    hr = pAudioClient->Initialize(
            AUDCLNT_SHAREMODE_SHARED,
            0,
            hnsRequestedDuration,
            0,
            &Wfx.Format,
            NULL);
    EXIT_ON_ERROR(hr);

    hr = pAudioClient->GetBufferSize(&bufferFrameCount);
    EXIT_ON_ERROR(hr);

    hr = pAudioClient->GetService(
            IID_IAudioCaptureClient,
            reinterpret_cast<void**>(&pCaptureClient));
    EXIT_ON_ERROR(hr);

    // Actual duration of the allocated buffer. The frame count is divided by
    // the sample rate the stream was opened with, not the mix-format rate.
    hnsActualDuration = static_cast<REFERENCE_TIME>(
        static_cast<double>(REFTIMES_PER_SEC) * bufferFrameCount / Wfx.Format.nSamplesPerSec);

    // NOTE(review): the loop polls once per half buffer duration; with the
    // buffer requested above that is a long interval between reads. Confirm
    // this is acceptable for the latency you need.
    sleepMs = static_cast<DWORD>(hnsActualDuration / REFTIMES_PER_MILLISEC / 2);

    hr = pAudioClient->Start();  // Start recording.
    EXIT_ON_ERROR(hr);
    captureStarted = true;

    // Each loop iteration drains whatever accumulated during the sleep.
    while (m_bRuning)
    {
        // Sleep for half the buffer duration.
        Sleep(sleepMs);

        hr = pCaptureClient->GetNextPacketSize(&packetLength);
        EXIT_ON_ERROR(hr);

        while (packetLength != 0)
        {
            UINT64 recTime = 0;
            UINT64 recPos = 0;

            // Get the available data in the shared buffer.
            hr = pCaptureClient->GetBuffer(
                &pData,
                &numFramesAvailable,
                &flags, &recPos, &recTime);
            EXIT_ON_ERROR(hr);

            // Write to the recording file.
            if (m_bRecord)
            {
                if (m_pWavWapper == nullptr)
                {
                    ++nFileNumber;
                    std::string path = m_strRecordPath + "/" + std::to_string(nFileNumber) + ".pcm";

                    m_pWavWapper = new WavWapper;
                    m_pWavWapper->Init(path, channnels, samplesPerSec, bitPerSample);
                }

                const UINT32 byteCount = numFramesAvailable * blockAlign;
                BYTE* pWrite = pData;
                if (flags & AUDCLNT_BUFFERFLAGS_SILENT)
                {
                    // The packet content must be treated as silence: write
                    // zeros rather than handing a NULL pointer to the writer.
                    silence.assign(byteCount, 0);
                    pWrite = silence.data();
                }

                m_pWavWapper->Write(pWrite, byteCount);
            }
            else
            {
                if (m_pWavWapper)
                {
                    m_pWavWapper->CloseFile();
                    delete m_pWavWapper;
                    m_pWavWapper = nullptr;
                }
            }

            hr = pCaptureClient->ReleaseBuffer(numFramesAvailable);
            EXIT_ON_ERROR(hr);

            hr = pCaptureClient->GetNextPacketSize(&packetLength);
            EXIT_ON_ERROR(hr);
        }
    }

    hr = pAudioClient->Stop();  // Stop recording.
    captureStarted = false;

Exit:
    // On an error path the stream may still be running; stop it without
    // overwriting the HRESULT that describes the original failure.
    if (captureStarted && pAudioClient != NULL)
    {
        pAudioClient->Stop();
    }

    // Close a file that is still open (e.g. capture stopped while recording).
    if (m_pWavWapper)
    {
        m_pWavWapper->CloseFile();
        delete m_pWavWapper;
        m_pWavWapper = nullptr;
    }

    CoTaskMemFree(pWfxClosestMatch);
    CoTaskMemFree(pwfx);
    SAFE_RELEASE(pCaptureClient);
    SAFE_RELEASE(pAudioClient);
    SAFE_RELEASE(pDevice);
    SAFE_RELEASE(pEnumerator);

    if (comInitialized)
    {
        CoUninitialize();
    }

    return static_cast<DWORD>(hr);
}

/// Copies the friendly name of an endpoint device into a caller-supplied buffer.
///
/// If the device is NULL, or the name cannot be read as a wide string, the
/// placeholder "<Device not available>" is copied instead. The copy is
/// truncated to fit the buffer.
///
/// @param pDevice    Endpoint device to query; may be NULL.
/// @param pszBuffer  Destination buffer for the name.
/// @param bufferLen  Size of `pszBuffer`, in wide characters.
/// @return Always 0.
int32_t AudioCapture::_GetDeviceName(IMMDevice* pDevice, LPWSTR pszBuffer, int bufferLen)
{
    static const WCHAR szDefault[] = L"<Device not available>";

    IPropertyStore* store = NULL;
    HRESULT status = E_FAIL;

    if (pDevice != NULL)
    {
        status = pDevice->OpenPropertyStore(STGM_READ, &store);
        if (FAILED(status))
        {
            std::cout << "IMMDevice::OpenPropertyStore failed";
        }
    }

    // Container for the property value; initialized on every path because it
    // is cleared unconditionally before returning.
    PROPVARIANT friendlyName;
    PropVariantInit(&friendlyName);

    if (SUCCEEDED(status))
    {
        // Read the friendly-name property and validate the type of the result.
        status = store->GetValue(PKEY_Device_FriendlyName, &friendlyName);
        if (FAILED(status))
        {
            std::cout << "IPropertyStore::GetValue failed";
        }
        else if (friendlyName.vt == VT_EMPTY)
        {
            status = E_FAIL;
            std::cout << "IPropertyStore::GetValue returned no value";
        }
        else if (friendlyName.vt != VT_LPWSTR)
        {
            // The value is not a null-terminated wide string.
            status = E_UNEXPECTED;
            std::cout << "IPropertyStore::GetValue returned unexpected";
        }
    }

    // Choose the real name when one was obtained, the placeholder otherwise.
    const WCHAR* chosen = szDefault;
    if (SUCCEEDED(status) && friendlyName.pwszVal != NULL)
    {
        chosen = friendlyName.pwszVal;
    }
    wcsncpy_s(pszBuffer, bufferLen, chosen, _TRUNCATE);

    PropVariantClear(&friendlyName);
    if (store != NULL)
    {
        store->Release();
        store = NULL;
    }

    return 0;
}
  1. Count the number of audio callbacks in ten seconds:
    10 s / 10 ms = 1000 callbacks

    When the delay occurs, the number of audio callbacks is often less than 1,000.

  2. If this is a problem in WebRTC's audio recording, how can it be fixed?

0

There are 0 answers