From 65927c2091e6cdb6e29af95e7606a00a14721daf Mon Sep 17 00:00:00 2001
From: dijunkun <junkun.di@hotmail.com>
Date: Thu, 15 Aug 2024 11:04:06 +0800
Subject: [PATCH] [feat] support speaker capture on Windows

---
 src/single_window/render.cpp                  |  10 +-
 src/single_window/render_callback_func.cpp    |   2 +-
 .../windows/speaker_capturer_wasapi.cpp       | 199 +++++-------------
 .../windows/speaker_capturer_wasapi.h         |  32 +--
 test/audio_capture/miniaudio.cpp              |  46 ++--
 xmake.lua                                     |   3 +-
 6 files changed, 93 insertions(+), 199 deletions(-)
diff --git a/src/single_window/render.cpp b/src/single_window/render.cpp
index ab59936..155da50 100644
--- a/src/single_window/render.cpp
+++ b/src/single_window/render.cpp
@@ -168,7 +168,7 @@ int Render::StartScreenCapture() {
         std::chrono::duration<double> duration = now_time - last_frame_time_;
         auto tc = duration.count() * 1000;
 
-        if (tc >= 0) {
+        if (tc >= 0 && connection_established_) {
           SendData(peer_, DATA_TYPE::VIDEO, (const char *)data,
                    NV12_BUFFER_SIZE);
           last_frame_time_ = now_time;
@@ -203,7 +203,9 @@ int Render::StartSpeakerCapture() {
 
   int speaker_capturer_init_ret =
       speaker_capturer_->Init([this](unsigned char *data, size_t size) -> void {
-        SendData(peer_, DATA_TYPE::AUDIO, (const char *)data, size);
+        // Captured audio is forwarded only while a connection is established.
+        if (!connection_established_) return;
+        SendData(peer_, DATA_TYPE::AUDIO, (const char *)data, size);
       });
 
   if (0 == speaker_capturer_init_ret) {
@@ -421,13 +423,13 @@ int Render::Run() {
     screen_capturer_factory_ = new ScreenCapturerFactory();
 
     // Speaker capture
-    // speaker_capturer_factory_ = new SpeakerCapturerFactory();
+    speaker_capturer_factory_ = new SpeakerCapturerFactory();
 
     // Mouse control
     device_controller_factory_ = new DeviceControllerFactory();
   }
 
-  // StartSpeakerCapture();
+  StartSpeakerCapture();
 
   // Main loop
   while (!exit_) {
diff --git a/src/single_window/render_callback_func.cpp b/src/single_window/render_callback_func.cpp
index 965db97..725e471 100644
--- a/src/single_window/render_callback_func.cpp
+++ b/src/single_window/render_callback_func.cpp
@@ -13,7 +13,7 @@
 #endif
 
 int Render::ProcessMouseKeyEven(SDL_Event &ev) {
-  if (!control_mouse_) {
+  if (!(control_mouse_ && connection_established_)) {
     return 0;
   }
 
diff --git a/src/speaker_capturer/windows/speaker_capturer_wasapi.cpp b/src/speaker_capturer/windows/speaker_capturer_wasapi.cpp
index 37cddd0..61e5b9e 100644
--- a/src/speaker_capturer/windows/speaker_capturer_wasapi.cpp
+++ b/src/speaker_capturer/windows/speaker_capturer_wasapi.cpp
@@ -1,187 +1,136 @@
 #include "speaker_capturer_wasapi.h"
 
-#include <algorithm>
-#include <climits>
-#include <iostream>
+#include "rd_log.h"
 
-#define REFTIMES_PER_SEC 10000000
-#define REFTIMES_PER_MILLISEC 10000
+#define MINIAUDIO_IMPLEMENTATION
+#include "miniaudio.h"
 
 #define SAVE_AUDIO_FILE 0
 
-#define CHECK_HR(hres) \
-  if (FAILED(hres)) {  \
-    return -1;         \
+// Loopback device state. These are file-scope statics, so only one
+// SpeakerCapturerWasapi instance can own the device at a time.
+static ma_device_config device_config_;
+static ma_device device_;
+static ma_format format_ = ma_format_s16;
+static ma_uint32 sample_rate_ = ma_standard_sample_rate_48000;
+static ma_uint32 channels_ = 1;
+static FILE* fp_ = nullptr;
+
+// miniaudio data callback: hands each captured buffer (pInput, frameCount
+// frames of format_/channels_) to the callback registered through Init().
+void data_callback(ma_device* pDevice, void* pOutput, const void* pInput,
+                   ma_uint32 frameCount) {
+  SpeakerCapturerWasapi* ptr = (SpeakerCapturerWasapi*)pDevice->pUserData;
+  if (ptr && pInput) {
+    const size_t size =
+        (size_t)frameCount * ma_get_bytes_per_frame(format_, channels_);
+
+    // The dump file may have failed to open; never write to a null stream.
+    if (SAVE_AUDIO_FILE && fp_) {
+      fwrite(pInput, size, 1, fp_);
+    }
+
+    // Init() stores whatever callback it is given, including an empty one.
+    const auto cb = ptr->GetCallback();
+    if (cb) {
+      cb((unsigned char*)pInput, size);
+    }
   }
 
-#define SAFE_RELEASE(punk) \
-  if ((punk) != nullptr) { \
-    (punk)->Release();     \
-    (punk) = nullptr;      \
-  }
+  (void)pOutput;
+}
 
-const CLSID CLSID_MMDeviceEnumerator = __uuidof(MMDeviceEnumerator);
-const IID IID_IMMDeviceEnumerator = __uuidof(IMMDeviceEnumerator);
-const IID IID_IAudioClient = __uuidof(IAudioClient);
-const IID IID_IAudioCaptureClient = __uuidof(IAudioCaptureClient);
+SpeakerCapturerWasapi::speaker_data_cb SpeakerCapturerWasapi::GetCallback() {
+  return cb_;
+}
 
 SpeakerCapturerWasapi::SpeakerCapturerWasapi() {}
 
 SpeakerCapturerWasapi::~SpeakerCapturerWasapi() {
-  if (inited_ && capture_thread_->joinable()) {
-    capture_thread_->join();
-    inited_ = false;
-  }
-
-  CoTaskMemFree(pwfx);
-  SAFE_RELEASE(pEnumerator)
-  SAFE_RELEASE(pDevice)
-  SAFE_RELEASE(pAudioClient)
-  SAFE_RELEASE(pCaptureClient)
-
+  // Shut the device down before anything else: its data callback holds a
+  // pointer to this object and must not run once destruction has begun.
+  if (inited_) {
+    ma_device_uninit(&device_);
+    inited_ = false;
+  }
+
   if (SAVE_AUDIO_FILE) {
-    fclose(fp);
+    if (fp_) {
+      fclose(fp_);
+      fp_ = nullptr;
+    }
   }
-
-  // if (pData_dst) delete pData_dst;
-  // pData_dst = nullptr;
 }
 
 int SpeakerCapturerWasapi::Init(speaker_data_cb cb) {
+  // Already initialized: keep the existing device and callback.
+  if (inited_) {
+    return 0;
+  }
+
   cb_ = cb;
 
   if (SAVE_AUDIO_FILE) {
-    fopen_s(&fp, "system_audio.pcm", "wb");
+    // Optional debug dump. Skipped when a previous Init() already opened it;
+    // capture still proceeds if the file cannot be created.
+    if (!fp_ && fopen_s(&fp_, "system_audio.pcm", "wb") != 0) {
+      fp_ = nullptr;
+      LOG_ERROR("Failed to open system_audio.pcm");
+    }
   }
 
-  HRESULT hr;
+  ma_result result;
+  ma_backend backends[] = {ma_backend_wasapi};
 
-  hr = CoCreateInstance(CLSID_MMDeviceEnumerator, nullptr, CLSCTX_ALL,
-                        IID_IMMDeviceEnumerator, (void **)&pEnumerator);
-  CHECK_HR(hr)
+  device_config_ = ma_device_config_init(ma_device_type_loopback);
+  device_config_.capture.pDeviceID = NULL;
+  device_config_.capture.format = format_;
+  device_config_.capture.channels = channels_;
+  device_config_.sampleRate = sample_rate_;
+  device_config_.dataCallback = data_callback;
+  device_config_.pUserData = this;
 
-  hr = pEnumerator->GetDefaultAudioEndpoint(eRender, eConsole,
-                                            &pDevice);  // 输出
-  CHECK_HR(hr)
-
-  hr = pDevice->Activate(IID_IAudioClient, CLSCTX_ALL, nullptr,
-                         (void **)&pAudioClient);
-  CHECK_HR(hr)
-
-  hr = pAudioClient->GetMixFormat(&pwfx);
-  CHECK_HR(hr)
-
-  // Change to 16bit
-  if (pwfx->wFormatTag == WAVE_FORMAT_IEEE_FLOAT) {
-    pwfx->wFormatTag = WAVE_FORMAT_PCM;
-    pwfx->wBitsPerSample = 16;
-    pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
-    pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
-  } else if (pwfx->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
-    PWAVEFORMATEXTENSIBLE pEx = reinterpret_cast<PWAVEFORMATEXTENSIBLE>(pwfx);
-    if (IsEqualGUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, pEx->SubFormat)) {
-      pEx->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
-      pEx->Samples.wValidBitsPerSample = 16;
-      pwfx->wBitsPerSample = 16;
-      pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
-      pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
-    }
+  result = ma_device_init_ex(backends, sizeof(backends) / sizeof(backends[0]),
+                             NULL, &device_config_, &device_);
+  if (result != MA_SUCCESS) {
+    LOG_ERROR("Failed to initialize loopback device");
+    return -1;
   }
 
-  hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED,
-                                AUDCLNT_STREAMFLAGS_LOOPBACK, 0, 0, pwfx,
-                                nullptr);
-  CHECK_HR(hr)
-
-  // Get the size of the allocated buffer.
-  hr = pAudioClient->GetBufferSize(&bufferFrameCount);
-  CHECK_HR(hr)
-
-  hr = pAudioClient->GetService(IID_IAudioCaptureClient,
-                                (void **)&pCaptureClient);
-  CHECK_HR(hr)
-
-  // Show audio info
-  {
-    printf("wFormatTag is %x\n", pwfx->wFormatTag);
-    printf("nChannels is %x\n", pwfx->nChannels);
-    printf("nSamplesPerSec is %d\n", pwfx->nSamplesPerSec);
-    printf("nAvgBytesPerSec is %d\n", pwfx->nAvgBytesPerSec);
-    printf("wBitsPerSample is %d\n", pwfx->wBitsPerSample);
-  }
-
-  hnsActualDuration =
-      (double)REFTIMES_PER_SEC * bufferFrameCount / pwfx->nSamplesPerSec;
-
-  // pData_dst = new BYTE[960];
-
   inited_ = true;
 
   return 0;
 }
 
 int SpeakerCapturerWasapi::Start() {
-  HRESULT hr;
-  hr = pAudioClient->Start();
-  CHECK_HR(hr)
-
-  capture_thread_.reset(new std::thread([this]() {
-    HRESULT hr;
-
-    // Each loop fills about half of the shared buffer.
-    while (1) {
-      // Sleep for half the buffer duration.
-      Sleep(hnsActualDuration / REFTIMES_PER_MILLISEC / 4);
-
-      hr = pCaptureClient->GetNextPacketSize(&packetLength);
-      CHECK_HR(hr)
-
-      while (packetLength != 0) {
-        // Get the available data in the shared buffer.
-        hr = pCaptureClient->GetBuffer(&pData, &numFramesAvailable, &flags,
-                                       nullptr, &ts);
-        CHECK_HR(hr)
-
-        // flags equals to 2 means silence, set data to nullptr
-        if (flags == AUDCLNT_BUFFERFLAGS_SILENT) {
-          pData = nullptr;
-        }
-
-        if (pData != nullptr) {
-          size_t size = numFramesAvailable * pwfx->nBlockAlign;
-
-          for (int i = 0; i < size / 2; i++) {
-            BYTE left = pData[i * 2];
-            BYTE right = pData[i * 2 + 1];
-            // Right channel only?
-            BYTE monoSample = right;
-
-            pData_dst[i] = static_cast<BYTE>(monoSample);
-          }
-
-          cb_(pData_dst, size / 2);
-
-          if (SAVE_AUDIO_FILE) {
-            fwrite(pData_dst, size / 2, 1, fp);
-          }
-        }
-
-        hr = pCaptureClient->ReleaseBuffer(numFramesAvailable);
-        CHECK_HR(hr)
-
-        hr = pCaptureClient->GetNextPacketSize(&packetLength);
-        CHECK_HR(hr)
-      }
-    }
-  }));
+  // Starting a device that was never initialized (or was released by Stop())
+  // is invalid, so refuse instead.
+  if (!inited_) {
+    LOG_ERROR("Speaker capturer is not initialized");
+    return -1;
+  }
+
+  ma_result result = ma_device_start(&device_);
+  if (result != MA_SUCCESS) {
+    // The device is released here, so clear inited_ to keep Stop() and the
+    // destructor from releasing it a second time.
+    ma_device_uninit(&device_);
+    inited_ = false;
+    LOG_ERROR("Failed to start device");
+    return -1;
+  }
 
   return 0;
 }
 
 int SpeakerCapturerWasapi::Stop() {
-  HRESULT hr;
-  hr = pAudioClient->Stop();
-  CHECK_HR(hr)
+  // Releases the device (which also stops it). Guarded so that repeated calls
+  // are harmless; a later Init() creates the device again.
+  if (inited_) {
+    ma_device_uninit(&device_);
+    inited_ = false;
+  }
+
   return 0;
 }
 
diff --git a/src/speaker_capturer/windows/speaker_capturer_wasapi.h b/src/speaker_capturer/windows/speaker_capturer_wasapi.h
index 3bfafab..e2a2a9e 100644
--- a/src/speaker_capturer/windows/speaker_capturer_wasapi.h
+++ b/src/speaker_capturer/windows/speaker_capturer_wasapi.h
@@ -1,20 +1,12 @@
 /*
  * @Author: DI JUNKUN
- * @Date: 2024-07-22
+ * @Date: 2024-08-15
  * Copyright (c) 2024 by DI JUNKUN, All Rights Reserved.
  */
 
 #ifndef _SPEAKER_CAPTURER_WASAPI_H_
 #define _SPEAKER_CAPTURER_WASAPI_H_
 
-#include <Audioclient.h>
-#include <Devicetopology.h>
-#include <Endpointvolume.h>
-#include <Mmdeviceapi.h>
-
-#include <thread>
-#include <vector>
-
 #include "speaker_capturer.h"
 
 class SpeakerCapturerWasapi : public SpeakerCapturer {
@@ -31,31 +23,13 @@ class SpeakerCapturerWasapi : public SpeakerCapturer {
   int Pause();
   int Resume();
 
+  speaker_data_cb GetCallback();
+
  private:
   speaker_data_cb cb_ = nullptr;
 
  private:
-  REFERENCE_TIME hnsActualDuration;
-  UINT32 bufferFrameCount;
-  UINT32 numFramesAvailable;
-  BYTE *pData;
-  // std::vector<BYTE> pData_dst;
-  BYTE pData_dst[960];
-  DWORD flags;
-
-  // REFERENCE_TIME hnsRequestedDuration = 10000000;
-  IMMDeviceEnumerator *pEnumerator = NULL;
-  IMMDevice *pDevice = NULL;
-  IAudioClient *pAudioClient = NULL;
-  IAudioCaptureClient *pCaptureClient = NULL;
-  WAVEFORMATEX *pwfx = NULL;
-  UINT32 packetLength = 0;
-  UINT64 pos, ts;
-  FILE *fp;
-
   bool inited_ = false;
-  // thread
-  std::unique_ptr<std::thread> capture_thread_ = nullptr;
 };
 
 #endif
\ No newline at end of file
diff --git a/test/audio_capture/miniaudio.cpp b/test/audio_capture/miniaudio.cpp
index 8169e13..03aaa43 100644
--- a/test/audio_capture/miniaudio.cpp
+++ b/test/audio_capture/miniaudio.cpp
@@ -18,12 +18,16 @@ buffer in the callback will be null whereas the input buffer will be valid.
 #include <stdio.h>
 #include <stdlib.h>
 
+FILE* fp;
+
 void data_callback(ma_device* pDevice, void* pOutput, const void* pInput,
                    ma_uint32 frameCount) {
-  ma_encoder* pEncoder = (ma_encoder*)pDevice->pUserData;
-  MA_ASSERT(pEncoder != NULL);
+  // ma_encoder* pEncoder = (ma_encoder*)pDevice->pUserData;
+  // MA_ASSERT(pEncoder != NULL);
 
-  ma_encoder_write_pcm_frames(pEncoder, pInput, frameCount, NULL);
+  // ma_encoder_write_pcm_frames(pEncoder, pInput, frameCount, NULL);
+
+  fwrite(pInput, frameCount * ma_get_bytes_per_frame(ma_format_s16, 1), 1, fp);
 
   (void)pOutput;
 }
@@ -35,32 +39,41 @@ int main(int argc, char** argv) {
   ma_device_config deviceConfig;
   ma_device device;
 
+  /* Open the raw PCM dump first and bail out on failure, so data_callback
+     never writes to a NULL stream. */
+  if (fopen_s(&fp, "miniaudio.pcm", "wb") != 0 || fp == NULL) {
+    printf("Failed to open output file.\n");
+    return -1;
+  }
+
   /* Loopback mode is currently only supported on WASAPI. */
   ma_backend backends[] = {ma_backend_wasapi};
 
-  if (argc < 2) {
-    printf("No output file.\n");
-    return -1;
-  }
+  // if (argc < 2) {
+  //   printf("No output file.\n");
+  //   return -1;
+  // }
 
-  encoderConfig =
-      ma_encoder_config_init(ma_encoding_format_wav, ma_format_s16, 1, 48000);
+  // encoderConfig =
+  //     ma_encoder_config_init(ma_encoding_format_wav, ma_format_s16, 1,
+  //     48000);
 
-  if (ma_encoder_init_file(argv[1], &encoderConfig, &encoder) != MA_SUCCESS) {
-    printf("Failed to initialize output file.\n");
-    return -1;
-  }
+  // if (ma_encoder_init_file(argv[1], &encoderConfig, &encoder) != MA_SUCCESS)
+  // {
+  //   printf("Failed to initialize output file.\n");
+  //   return -1;
+  // }
 
   deviceConfig = ma_device_config_init(ma_device_type_loopback);
   deviceConfig.capture.pDeviceID =
       NULL; /* Use default device for this example. Set this to the ID of a
                _playback_ device if you want to capture from a specific device.
             */
-  deviceConfig.capture.format = encoder.config.format;
-  deviceConfig.capture.channels = encoder.config.channels;
-  deviceConfig.sampleRate = encoder.config.sampleRate;
+  deviceConfig.capture.format = ma_format_s16;
+  deviceConfig.capture.channels = 1;
+  deviceConfig.sampleRate = 48000;
   deviceConfig.dataCallback = data_callback;
-  deviceConfig.pUserData = &encoder;
+  deviceConfig.pUserData = nullptr;
 
   result = ma_device_init_ex(backends, sizeof(backends) / sizeof(backends[0]),
                              NULL, &deviceConfig, &device);
@@ -79,8 +92,12 @@ int main(int argc, char** argv) {
   printf("Press Enter to stop recording...\n");
   getchar();
 
   ma_device_uninit(&device);
-  ma_encoder_uninit(&encoder);
+  // ma_encoder_uninit(&encoder);
+
+  /* Close the dump only after the device is gone: until ma_device_uninit()
+     returns, data_callback may still be writing to fp. */
+  fclose(fp);
 
   return 0;
 }
\ No newline at end of file
diff --git a/xmake.lua b/xmake.lua
index 51ebecb..c3a4b69 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -3,7 +3,6 @@ set_license("LGPL-3.0")
 
 set_version("0.0.1")
 add_defines("RD_VERSION=\"0.0.1\"");
-add_defines("MINIAUDIO_IMPLEMENTATION")
 
 add_rules("mode.release", "mode.debug")
 set_languages("c++17")
@@ -20,6 +19,7 @@ end
 add_requires("spdlog 1.14.1", {system = false})
 add_requires("imgui v1.91.0", {configs = {sdl2 = true, sdl2_renderer = true}})
 add_requires("libyuv")
+add_requires("miniaudio")
 
 if is_os("windows") then
     add_links("Shell32", "windowsapp", "dwmapi", "User32", "kernel32",
@@ -83,6 +83,7 @@ target("speaker_capturer")
     add_deps("rd_log")
     add_includedirs("src/speaker_capturer", {public = true})
     if is_os("windows") then
+        add_packages("miniaudio")
         add_files("src/speaker_capturer/windows/*.cpp")
         add_includedirs("src/speaker_capturer/windows", {public = true})
     elseif is_os("macosx") then