[feat] support speaker capture on Windows

2025-12-17 04:26:47 +08:00 · 2024-08-15 11:04:06 +08:00
parent 574b9d10ab
commit 65927c2091
6 changed files with 93 additions and 199 deletions
--- a/src/single_window/render.cpp
+++ b/src/single_window/render.cpp
@@ -168,7 +168,7 @@ int Render::StartScreenCapture() {
        std::chrono::duration<double> duration = now_time - last_frame_time_;
        auto tc = duration.count() * 1000;

-        if (tc >= 0) {
+        if (tc >= 0 && connection_established_) {
          SendData(peer_, DATA_TYPE::VIDEO, (const char *)data,
                   NV12_BUFFER_SIZE);
          last_frame_time_ = now_time;
@@ -203,7 +203,9 @@ int Render::StartSpeakerCapture() {

  int speaker_capturer_init_ret =
      speaker_capturer_->Init([this](unsigned char *data, size_t size) -> void {
+        if (connection_established_) {
          SendData(peer_, DATA_TYPE::AUDIO, (const char *)data, size);
+        }
      });

  if (0 == speaker_capturer_init_ret) {
@@ -421,13 +423,13 @@ int Render::Run() {
    screen_capturer_factory_ = new ScreenCapturerFactory();

    // Speaker capture
-    // speaker_capturer_factory_ = new SpeakerCapturerFactory();
+    speaker_capturer_factory_ = new SpeakerCapturerFactory();

    // Mouse control
    device_controller_factory_ = new DeviceControllerFactory();
  }

-  // StartSpeakerCapture();
+  StartSpeakerCapture();

  // Main loop
  while (!exit_) {
--- a/src/single_window/render_callback_func.cpp
+++ b/src/single_window/render_callback_func.cpp
@@ -13,7 +13,7 @@
 #endif

 int Render::ProcessMouseKeyEven(SDL_Event &ev) {
-  if (!control_mouse_) {
+  if (!control_mouse_ || !connection_established_) {
    return 0;
  }

--- a/src/speaker_capturer/windows/speaker_capturer_wasapi.cpp
+++ b/src/speaker_capturer/windows/speaker_capturer_wasapi.cpp
@@ -1,119 +1,96 @@
 #include "speaker_capturer_wasapi.h"

-#include <algorithm>
-#include <climits>
-#include <iostream>
+#include "rd_log.h"

-#define REFTIMES_PER_SEC 10000000
-#define REFTIMES_PER_MILLISEC 10000
+#define MINIAUDIO_IMPLEMENTATION
+#include "miniaudio.h"

 #define SAVE_AUDIO_FILE 0

-#define CHECK_HR(hres) \
-  if (FAILED(hres)) {  \
-    return -1;         \
+// File-scope capture state. NOTE(review): shared by every
+// SpeakerCapturerWasapi instance, so only one capturer should be live at once.
+static ma_device_config device_config_;
+static ma_device device_;
+static ma_format format_ = ma_format_s16;
+static ma_uint32 sample_rate_ = ma_standard_sample_rate_48000;
+static ma_uint32 channels_ = 1;
+static FILE* fp_ = nullptr;
+
+// miniaudio data callback: hands each captured input buffer to the capturer
+// stored in pUserData. pOutput is not used.
+void data_callback(ma_device* pDevice, void* pOutput, const void* pInput,
+                   ma_uint32 frameCount) {
+  SpeakerCapturerWasapi* ptr = (SpeakerCapturerWasapi*)pDevice->pUserData;
+  const ma_uint32 byte_count =
+      frameCount * ma_get_bytes_per_frame(format_, channels_);
+  if (ptr) {
+    // Skip the debug dump when the file was never opened.
+    if (SAVE_AUDIO_FILE && fp_) {
+      fwrite(pInput, byte_count, 1, fp_);
    }

-#define SAFE_RELEASE(punk) \
-  if ((punk) != nullptr) { \
-    (punk)->Release();     \
-    (punk) = nullptr;      \
+    // Never invoke an empty callback (e.g. after Init(nullptr)); only call it
+    // when one is actually registered.
+    const auto cb = ptr->GetCallback();
+    if (cb) {
+      cb((unsigned char*)pInput, byte_count);
+    }
  }

-const CLSID CLSID_MMDeviceEnumerator = __uuidof(MMDeviceEnumerator);
-const IID IID_IMMDeviceEnumerator = __uuidof(IMMDeviceEnumerator);
-const IID IID_IAudioClient = __uuidof(IAudioClient);
-const IID IID_IAudioCaptureClient = __uuidof(IAudioCaptureClient);
+  (void)pOutput;
+}
+
+// Returns the data callback registered through Init().
+SpeakerCapturerWasapi::speaker_data_cb SpeakerCapturerWasapi::GetCallback() {
+  return cb_;
+}

 SpeakerCapturerWasapi::SpeakerCapturerWasapi() {}

 SpeakerCapturerWasapi::~SpeakerCapturerWasapi() {
-  if (inited_ && capture_thread_->joinable()) {
-    capture_thread_->join();
-    inited_ = false;
-  }
-
-  CoTaskMemFree(pwfx);
-  SAFE_RELEASE(pEnumerator)
-  SAFE_RELEASE(pDevice)
-  SAFE_RELEASE(pAudioClient)
-  SAFE_RELEASE(pCaptureClient)
-
+  // Release the device before the object is destroyed; otherwise
+  // data_callback could still run with pUserData pointing at this instance.
+  if (inited_) {
+    ma_device_uninit(&device_);
+    inited_ = false;
+  }
+
  if (SAVE_AUDIO_FILE) {
-    fclose(fp);
+    if (fp_) {
+      fclose(fp_);
+      fp_ = nullptr;
+    }
  }
-
-  // if (pData_dst) delete pData_dst;
-  // pData_dst = nullptr;
 }

 int SpeakerCapturerWasapi::Init(speaker_data_cb cb) {
+  if (inited_) {
+    return 0;
+  }
+
  cb_ = cb;

  if (SAVE_AUDIO_FILE) {
-    fopen_s(&fp, "system_audio.pcm", "wb");
+    fopen_s(&fp_, "system_audio.pcm", "wb");
  }

-  HRESULT hr;
+  ma_result result;
+  ma_backend backends[] = {ma_backend_wasapi};

-  hr = CoCreateInstance(CLSID_MMDeviceEnumerator, nullptr, CLSCTX_ALL,
-                        IID_IMMDeviceEnumerator, (void **)&pEnumerator);
-  CHECK_HR(hr)
+  device_config_ = ma_device_config_init(ma_device_type_loopback);
+  device_config_.capture.pDeviceID = NULL;
+  device_config_.capture.format = format_;
+  device_config_.capture.channels = channels_;
+  device_config_.sampleRate = sample_rate_;
+  device_config_.dataCallback = data_callback;
+  device_config_.pUserData = this;

-  hr = pEnumerator->GetDefaultAudioEndpoint(eRender, eConsole,
-                                            &pDevice);  // 输出
-  CHECK_HR(hr)
-
-  hr = pDevice->Activate(IID_IAudioClient, CLSCTX_ALL, nullptr,
-                         (void **)&pAudioClient);
-  CHECK_HR(hr)
-
-  hr = pAudioClient->GetMixFormat(&pwfx);
-  CHECK_HR(hr)
-
-  // Change to 16bit
-  if (pwfx->wFormatTag == WAVE_FORMAT_IEEE_FLOAT) {
-    pwfx->wFormatTag = WAVE_FORMAT_PCM;
-    pwfx->wBitsPerSample = 16;
-    pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
-    pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
-  } else if (pwfx->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
-    PWAVEFORMATEXTENSIBLE pEx = reinterpret_cast<PWAVEFORMATEXTENSIBLE>(pwfx);
-    if (IsEqualGUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, pEx->SubFormat)) {
-      pEx->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
-      pEx->Samples.wValidBitsPerSample = 16;
-      pwfx->wBitsPerSample = 16;
-      pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
-      pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
+  result = ma_device_init_ex(backends, sizeof(backends) / sizeof(backends[0]),
+                             NULL, &device_config_, &device_);
+  if (result != MA_SUCCESS) {
+    LOG_ERROR("Failed to initialize loopback device");
+    return -1;
  }
-  }
-
-  hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED,
-                                AUDCLNT_STREAMFLAGS_LOOPBACK, 0, 0, pwfx,
-                                nullptr);
-  CHECK_HR(hr)
-
-  // Get the size of the allocated buffer.
-  hr = pAudioClient->GetBufferSize(&bufferFrameCount);
-  CHECK_HR(hr)
-
-  hr = pAudioClient->GetService(IID_IAudioCaptureClient,
-                                (void **)&pCaptureClient);
-  CHECK_HR(hr)
-
-  // Show audio info
-  {
-    printf("wFormatTag is %x\n", pwfx->wFormatTag);
-    printf("nChannels is %x\n", pwfx->nChannels);
-    printf("nSamplesPerSec is %d\n", pwfx->nSamplesPerSec);
-    printf("nAvgBytesPerSec is %d\n", pwfx->nAvgBytesPerSec);
-    printf("wBitsPerSample is %d\n", pwfx->wBitsPerSample);
-  }
-
-  hnsActualDuration =
-      (double)REFTIMES_PER_SEC * bufferFrameCount / pwfx->nSamplesPerSec;
-
-  // pData_dst = new BYTE[960];

  inited_ = true;

@@ -121,67 +77,31 @@ int SpeakerCapturerWasapi::Init(speaker_data_cb cb) {
 }

+// Starts the loopback device created by Init(). Returns 0 on success; on
+// failure the device is released and -1 is returned.
 int SpeakerCapturerWasapi::Start() {
-  HRESULT hr;
-  hr = pAudioClient->Start();
-  CHECK_HR(hr)
-
-  capture_thread_.reset(new std::thread([this]() {
-    HRESULT hr;
-
-    // Each loop fills about half of the shared buffer.
-    while (1) {
-      // Sleep for half the buffer duration.
-      Sleep(hnsActualDuration / REFTIMES_PER_MILLISEC / 4);
-
-      hr = pCaptureClient->GetNextPacketSize(&packetLength);
-      CHECK_HR(hr)
-
-      while (packetLength != 0) {
-        // Get the available data in the shared buffer.
-        hr = pCaptureClient->GetBuffer(&pData, &numFramesAvailable, &flags,
-                                       nullptr, &ts);
-        CHECK_HR(hr)
-
-        // flags equals to 2 means silence, set data to nullptr
-        if (flags == AUDCLNT_BUFFERFLAGS_SILENT) {
-          pData = nullptr;
+  ma_result result = ma_device_start(&device_);
+  if (result != MA_SUCCESS) {
+    ma_device_uninit(&device_);
+    // The device no longer exists, so let the next Init() rebuild it.
+    inited_ = false;
+    LOG_ERROR("Failed to start device");
+    return -1;
  }

-        if (pData != nullptr) {
-          size_t size = numFramesAvailable * pwfx->nBlockAlign;
-
-          for (int i = 0; i < size / 2; i++) {
-            BYTE left = pData[i * 2];
-            BYTE right = pData[i * 2 + 1];
-            // Right channel only?
-            BYTE monoSample = right;
-
-            pData_dst[i] = static_cast<BYTE>(monoSample);
-          }
-
-          cb_(pData_dst, size / 2);
-
-          if (SAVE_AUDIO_FILE) {
-            fwrite(pData_dst, size / 2, 1, fp);
-          }
-        }
-
-        hr = pCaptureClient->ReleaseBuffer(numFramesAvailable);
-        CHECK_HR(hr)
-
-        hr = pCaptureClient->GetNextPacketSize(&packetLength);
-        CHECK_HR(hr)
-      }
-    }
-  }));
-
  return 0;
 }

+// Releases the loopback device. Always returns 0 and is a no-op when no
+// device is currently initialized.
 int SpeakerCapturerWasapi::Stop() {
-  HRESULT hr;
-  hr = pAudioClient->Stop();
-  CHECK_HR(hr)
+  if (!inited_) {
+    return 0;
+  }
+
+  // Clear inited_ along with the device; otherwise a later Init() would
+  // return early and Start() would run on an uninitialized device.
+  ma_device_uninit(&device_);
+  inited_ = false;
  return 0;
 }

--- a/src/speaker_capturer/windows/speaker_capturer_wasapi.h
+++ b/src/speaker_capturer/windows/speaker_capturer_wasapi.h
@@ -1,20 +1,12 @@
 /*
 * @Author: DI JUNKUN
- * @Date: 2024-07-22
+ * @Date: 2024-08-15
 * Copyright (c) 2024 by DI JUNKUN, All Rights Reserved.
 */

 #ifndef _SPEAKER_CAPTURER_WASAPI_H_
 #define _SPEAKER_CAPTURER_WASAPI_H_

-#include <Audioclient.h>
-#include <Devicetopology.h>
-#include <Endpointvolume.h>
-#include <Mmdeviceapi.h>
-
-#include <thread>
-#include <vector>
-
 #include "speaker_capturer.h"

 class SpeakerCapturerWasapi : public SpeakerCapturer {
@@ -31,31 +23,13 @@ class SpeakerCapturerWasapi : public SpeakerCapturer {
  int Pause();
  int Resume();

+  speaker_data_cb GetCallback();
+
 private:
  speaker_data_cb cb_ = nullptr;

 private:
-  REFERENCE_TIME hnsActualDuration;
-  UINT32 bufferFrameCount;
-  UINT32 numFramesAvailable;
-  BYTE *pData;
-  // std::vector<BYTE> pData_dst;
-  BYTE pData_dst[960];
-  DWORD flags;
-
-  // REFERENCE_TIME hnsRequestedDuration = 10000000;
-  IMMDeviceEnumerator *pEnumerator = NULL;
-  IMMDevice *pDevice = NULL;
-  IAudioClient *pAudioClient = NULL;
-  IAudioCaptureClient *pCaptureClient = NULL;
-  WAVEFORMATEX *pwfx = NULL;
-  UINT32 packetLength = 0;
-  UINT64 pos, ts;
-  FILE *fp;
-
  bool inited_ = false;
-  // thread
-  std::unique_ptr<std::thread> capture_thread_ = nullptr;
 };

 #endif
--- a/test/audio_capture/miniaudio.cpp
+++ b/test/audio_capture/miniaudio.cpp
@@ -18,12 +18,23 @@ buffer in the callback will be null whereas the input buffer will be valid.
 #include <stdio.h>
 #include <stdlib.h>

+/* Raw PCM dump written by data_callback; NULL while no file is open. */
+FILE* fp = NULL;
+
+/* Appends every captured input buffer to fp as mono s16 frames. */
 void data_callback(ma_device* pDevice, void* pOutput, const void* pInput,
                   ma_uint32 frameCount) {
-  ma_encoder* pEncoder = (ma_encoder*)pDevice->pUserData;
-  MA_ASSERT(pEncoder != NULL);
+  // ma_encoder* pEncoder = (ma_encoder*)pDevice->pUserData;
+  // MA_ASSERT(pEncoder != NULL);

-  ma_encoder_write_pcm_frames(pEncoder, pInput, frameCount, NULL);
+  // ma_encoder_write_pcm_frames(pEncoder, pInput, frameCount, NULL);
+
+  /* Skip the write when the dump file could not be opened. */
+  if (fp != NULL) {
+    fwrite(pInput, frameCount * ma_get_bytes_per_frame(ma_format_s16, 1), 1,
+           fp);
+  }

+  (void)pDevice;
  (void)pOutput;
 }
@@ -35,32 +39,36 @@ int main(int argc, char** argv) {
  ma_device_config deviceConfig;
  ma_device device;

+  fopen_s(&fp, "miniaudio.pcm", "wb");
+
  /* Loopback mode is currently only supported on WASAPI. */
  ma_backend backends[] = {ma_backend_wasapi};

-  if (argc < 2) {
-    printf("No output file.\n");
-    return -1;
-  }
+  // if (argc < 2) {
+  //   printf("No output file.\n");
+  //   return -1;
+  // }

-  encoderConfig =
-      ma_encoder_config_init(ma_encoding_format_wav, ma_format_s16, 1, 48000);
+  // encoderConfig =
+  //     ma_encoder_config_init(ma_encoding_format_wav, ma_format_s16, 1,
+  //     48000);

-  if (ma_encoder_init_file(argv[1], &encoderConfig, &encoder) != MA_SUCCESS) {
-    printf("Failed to initialize output file.\n");
-    return -1;
-  }
+  // if (ma_encoder_init_file(argv[1], &encoderConfig, &encoder) != MA_SUCCESS)
+  // {
+  //   printf("Failed to initialize output file.\n");
+  //   return -1;
+  // }

  deviceConfig = ma_device_config_init(ma_device_type_loopback);
  deviceConfig.capture.pDeviceID =
      NULL; /* Use default device for this example. Set this to the ID of a
               _playback_ device if you want to capture from a specific device.
             */
-  deviceConfig.capture.format = encoder.config.format;
-  deviceConfig.capture.channels = encoder.config.channels;
-  deviceConfig.sampleRate = encoder.config.sampleRate;
+  deviceConfig.capture.format = ma_format_s16;
+  deviceConfig.capture.channels = 1;
+  deviceConfig.sampleRate = 48000;
  deviceConfig.dataCallback = data_callback;
-  deviceConfig.pUserData = &encoder;
+  deviceConfig.pUserData = nullptr;

  result = ma_device_init_ex(backends, sizeof(backends) / sizeof(backends[0]),
                             NULL, &deviceConfig, &device);
@@ -79,8 +87,14 @@ int main(int argc, char** argv) {
  printf("Press Enter to stop recording...\n");
  getchar();

  ma_device_uninit(&device);
-  ma_encoder_uninit(&encoder);
+  // ma_encoder_uninit(&encoder);
+
+  /* Close the dump only after the device is uninitialized, so the capture
+     callback can no longer write to a closed FILE. */
+  if (fp != NULL) {
+    fclose(fp);
+  }

  return 0;
 }
--- a/xmake.lua
+++ b/xmake.lua
@@ -3,7 +3,6 @@ set_license("LGPL-3.0")

 set_version("0.0.1")
 add_defines("RD_VERSION=\"0.0.1\"");
-add_defines("MINIAUDIO_IMPLEMENTATION")

 add_rules("mode.release", "mode.debug")
 set_languages("c++17")
@@ -20,6 +19,7 @@ end
 add_requires("spdlog 1.14.1", {system = false})
 add_requires("imgui v1.91.0", {configs = {sdl2 = true, sdl2_renderer = true}})
 add_requires("libyuv")
+add_requires("miniaudio")

 if is_os("windows") then
    add_links("Shell32", "windowsapp", "dwmapi", "User32", "kernel32",
@@ -83,6 +83,7 @@ target("speaker_capturer")
    add_deps("rd_log")
    add_includedirs("src/speaker_capturer", {public = true})
    if is_os("windows") then
+        add_packages("miniaudio")
        add_files("src/speaker_capturer/windows/*.cpp")
        add_includedirs("src/speaker_capturer/windows", {public = true})
    elseif is_os("macosx") then