From 93ccf11fc506b95221ce0c5eddaed9e0e6c8b3b5 Mon Sep 17 00:00:00 2001 From: jacqueline Date: Tue, 8 Aug 2023 13:47:08 +1000 Subject: [PATCH] Investigate and improve core pinning for resampler --- src/app_console/app_console.cpp | 22 +++++- src/audio/audio_task.cpp | 5 +- src/audio/i2s_audio_output.cpp | 3 +- src/audio/resample.cpp | 116 ++++++++++++++++---------------- src/audio/sink_mixer.cpp | 5 +- 5 files changed, 90 insertions(+), 61 deletions(-) diff --git a/src/app_console/app_console.cpp b/src/app_console/app_console.cpp index a2fd749b..8686ac58 100644 --- a/src/app_console/app_console.cpp +++ b/src/app_console/app_console.cpp @@ -19,6 +19,7 @@ #include #include +#include "FreeRTOSConfig.h" #include "audio_events.hpp" #include "audio_fsm.hpp" #include "database.hpp" @@ -27,6 +28,7 @@ #include "esp_log.h" #include "event_queue.hpp" #include "ff.h" +#include "freertos/FreeRTOSConfig_arch.h" #include "freertos/projdefs.h" #include "index.hpp" #include "track.hpp" @@ -328,6 +330,12 @@ void RegisterDbDump() { } int CmdTaskStats(int argc, char** argv) { + if (!configUSE_TRACE_FACILITY) { + std::cout << "configUSE_TRACE_FACILITY must be enabled" << std::endl; + std::cout << "also consider configTASKLIST_USE_COREID" << std::endl; + return 1; + } + static const std::string usage = "usage: task_stats"; if (argc != 1) { std::cout << usage << std::endl; @@ -379,6 +387,14 @@ int CmdTaskStats(int argc, char** argv) { str << "\t"; } + if (configTASKLIST_INCLUDE_COREID) { + if (start_status[i].xCoreID == tskNO_AFFINITY) { + str << "any\t"; + } else { + str << start_status[i].xCoreID << "\t"; + } + } + str << std::fixed << std::setprecision(1) << depth_kib; str << " KiB"; if (depth_kib >= 10) { @@ -399,7 +415,11 @@ int CmdTaskStats(int argc, char** argv) { return first.first >= second.first; }); - std::cout << "name\t\tfree stack\trun time" << std::endl; + std::cout << "name\t\t"; + if (configTASKLIST_INCLUDE_COREID) { + std::cout << "core\t"; + } + std::cout << "free 
stack\trun time" << std::endl; for (const auto& i : info_strings) { std::cout << i.second << std::endl; } diff --git a/src/audio/audio_task.cpp b/src/audio/audio_task.cpp index 7c038730..75b44594 100644 --- a/src/audio/audio_task.cpp +++ b/src/audio/audio_task.cpp @@ -109,7 +109,10 @@ auto Timer::bytes_to_samples(uint32_t bytes) -> uint32_t { auto AudioTask::Start(IAudioSource* source, IAudioSink* sink) -> AudioTask* { AudioTask* task = new AudioTask(source, sink); - tasks::StartPersistent([=]() { task->Main(); }); + // Pin to CORE1 because codecs should be fixed point anyway, and being on + // the opposite core to the mixer maximises throughput in the worst case + // (some heavy codec like opus + resampling for bluetooth). + tasks::StartPersistent(1, [=]() { task->Main(); }); return task; } diff --git a/src/audio/i2s_audio_output.cpp b/src/audio/i2s_audio_output.cpp index e53dbe2a..d60ddfa4 100644 --- a/src/audio/i2s_audio_output.cpp +++ b/src/audio/i2s_audio_output.cpp @@ -120,7 +120,8 @@ auto I2SAudioOutput::PrepareFormat(const StreamInfo::Pcm& orig) return StreamInfo::Pcm{ .channels = std::min(orig.channels, 2), .bits_per_sample = std::clamp(orig.bits_per_sample, 16, 32), - .sample_rate = std::clamp(orig.sample_rate, 8000, 96000), + .sample_rate = 44100, + //.sample_rate = std::clamp(orig.sample_rate, 8000, 96000), }; } diff --git a/src/audio/resample.cpp b/src/audio/resample.cpp index 7accd0a1..430a6a26 100644 --- a/src/audio/resample.cpp +++ b/src/audio/resample.cpp @@ -1,4 +1,17 @@ +/* + * Copyright 2023 jacqueline + * + * SPDX-License-Identifier: GPL-3.0-only + */ #include "resample.hpp" +/* + * This file contains the implementation for a 32-bit floating point resampler. + * It is largely based on David Bryant's ART resampler, which is BSD-licensed, + * and available at https://github.com/dbry/audio-resampler/. + * + * This resampler uses windowed sinc interpolation filters, with an additional + * lowpass filter to reduce aliasing. 
+ */ #include #include @@ -14,13 +27,11 @@ namespace audio { -static constexpr char kTag[] = "resample"; - static constexpr double kLowPassRatio = 0.5; static constexpr size_t kNumFilters = 64; -static constexpr size_t kTapsPerFilter = 16; +static constexpr size_t kFilterSize = 16; -typedef std::array Filter; +typedef std::array Filter; static std::array sFilters{}; static bool sFiltersInitialised = false; @@ -35,15 +46,15 @@ Resampler::Resampler(uint32_t source_sample_rate, static_cast(source_sample_rate)), num_channels_(num_channels) { channel_buffers_.resize(num_channels); - channel_buffer_size_ = kTapsPerFilter * 16; + channel_buffer_size_ = kFilterSize * 16; for (int i = 0; i < num_channels; i++) { channel_buffers_[i] = static_cast(calloc(sizeof(float), channel_buffer_size_)); } - output_offset_ = kTapsPerFilter / 2.0f; - input_index_ = kTapsPerFilter; + output_offset_ = kFilterSize / 2.0f; + input_index_ = kFilterSize; if (!sFiltersInitialised) { sFiltersInitialised = true; @@ -64,7 +75,7 @@ auto Resampler::Process(cpp::span input, size_t input_frames = input.size() / num_channels_; size_t output_frames = output.size() / num_channels_; - int half_taps = kTapsPerFilter / 2; + int half_taps = kFilterSize / 2; while (output_frames > 0) { if (output_offset_ >= input_index_ - half_taps) { if (input_frames > 0) { @@ -74,12 +85,12 @@ auto Resampler::Process(cpp::span input, if (input_index_ == channel_buffer_size_) { for (int i = 0; i < num_channels_; ++i) { memmove(channel_buffers_[i], - channel_buffers_[i] + channel_buffer_size_ - kTapsPerFilter, - kTapsPerFilter * sizeof(float)); + channel_buffers_[i] + channel_buffer_size_ - kFilterSize, + kFilterSize * sizeof(float)); } - output_offset_ -= channel_buffer_size_ - kTapsPerFilter; - input_index_ -= channel_buffer_size_ - kTapsPerFilter; + output_offset_ -= channel_buffer_size_ - kFilterSize; + input_index_ -= channel_buffer_size_ - kFilterSize; } for (int i = 0; i < num_channels_; ++i) { @@ -97,7 +108,11 @@ auto 
Resampler::Process(cpp::span input, output[samples_produced++] = sample::FromFloat(Subsample(i)); } - output_offset_ += (1.0f / factor_); + // NOTE: floating point division here is potentially slow due to FPU + // limitations. Consider explicitly bundling the xtensa libgcc division via + // reciprocal implementation if we care about portability between + // compilers. + output_offset_ += 1.0f / factor_; output_frames--; } } @@ -105,36 +120,34 @@ auto Resampler::Process(cpp::span input, return {samples_used, samples_produced}; } +/* + * Constructs the filter in-place for the given index of sFilters. This only + * needs to be done once, per-filter. 64-bit math is okay here, because filters + * will not be initialised within a performance critical path. + */ auto InitFilter(int index) -> void { - const double a0 = 0.35875; - const double a1 = 0.48829; - const double a2 = 0.14128; - const double a3 = 0.01168; + Filter& filter = sFilters[index]; + std::array working_buffer{}; - double fraction = - static_cast(index) / static_cast(kNumFilters); + double fraction = index / static_cast(kNumFilters); double filter_sum = 0.0; - // "dist" is the absolute distance from the sinc maximum to the filter tap to - // be calculated, in radians "ratio" is that distance divided by half the tap - // count such that it reaches π at the window extremes - - // Note that with this scaling, the odd terms of the Blackman-Harris - // calculation appear to be negated with respect to the reference formula - // version. + for (int i = 0; i < kFilterSize; ++i) { + // "dist" is the absolute distance from the sinc maximum to the filter tap + // to be calculated, in radians.
+ double dist = fabs((kFilterSize / 2.0 - 1.0) + fraction - i) * M_PI; + // "ratio" is that distance divided by half the tap count such that it + // reaches π at the window extremes + double ratio = dist / (kFilterSize / 2.0); - Filter& filter = sFilters[index]; - std::array working_buffer{}; - for (int i = 0; i < kTapsPerFilter; ++i) { - double dist = fabs((kTapsPerFilter / 2.0 - 1.0) + fraction - i) * M_PI; - double ratio = dist / (kTapsPerFilter / 2.0); double value; - if (dist != 0.0) { value = sin(dist * kLowPassRatio) / (dist * kLowPassRatio); - // Blackman-Harris window - value *= a0 + a1 * cos(ratio) + a2 * cos(2 * ratio) + a3 * cos(3 * ratio); + // Hann window. We could alternatively use a Blackman Harris window, + // however our unusually small filter size makes the Hann window's + // steeper cutoff more important. + value *= 0.5 * (1.0 + cos(ratio)); } else { value = 1.0; } @@ -143,50 +156,39 @@ auto InitFilter(int index) -> void { filter_sum += value; } - // filter should have unity DC gain - + // Filter should have unity DC gain double scaler = 1.0 / filter_sum; double error = 0.0; - for (int i = kTapsPerFilter / 2; i < kTapsPerFilter; - i = kTapsPerFilter - i - (i >= kTapsPerFilter / 2)) { + for (int i = kFilterSize / 2; i < kFilterSize; + i = kFilterSize - i - (i >= kFilterSize / 2)) { working_buffer[i] *= scaler; filter[i] = working_buffer[i] - error; error += static_cast(filter[i]) - working_buffer[i]; } } +/* + * Performs sub-sampling with interpolation for the given channel. Assumes that + * the channel buffer has already been filled with samples. 
+ */ auto Resampler::Subsample(int channel) -> float { - float sum1, sum2; - cpp::span source{channel_buffers_[channel], channel_buffer_size_}; int offset_integral = std::floor(output_offset_); source = source.subspan(offset_integral); float offset_fractional = output_offset_ - offset_integral; - /* -// no interpolate -size_t filter_index = std::floor(offset_fractional * kNumFilters + 0.5f); -//ESP_LOGI(kTag, "selected filter %u of %u", filter_index, kNumFilters); -int start_offset = kTapsPerFilter / 2 + 1; -//ESP_LOGI(kTag, "using offset of %i, length %u", start_offset, kTapsPerFilter); - -return ApplyFilter( - sFilters[filter_index], - {source.data() - start_offset, kTapsPerFilter}); - */ - offset_fractional *= kNumFilters; int filter_index = std::floor(offset_fractional); - sum1 = ApplyFilter(sFilters[filter_index], - {source.data() - kTapsPerFilter / 2 + 1, kTapsPerFilter}); + float sum1 = ApplyFilter(sFilters[filter_index], + {source.data() - kFilterSize / 2 + 1, kFilterSize}); offset_fractional -= filter_index; - sum2 = ApplyFilter(sFilters[filter_index + 1], - {source.data() - kTapsPerFilter / 2 + 1, kTapsPerFilter}); + float sum2 = ApplyFilter(sFilters[filter_index + 1], + {source.data() - kFilterSize / 2 + 1, kFilterSize}); return (sum2 * offset_fractional) + (sum1 * (1.0f - offset_fractional)); } @@ -194,7 +196,7 @@ return ApplyFilter( auto Resampler::ApplyFilter(cpp::span filter, cpp::span input) -> float { float sum = 0.0; - for (int i = 0; i < kTapsPerFilter; i++) { + for (int i = 0; i < kFilterSize; i++) { sum += filter[i] * input[i]; } return sum; diff --git a/src/audio/sink_mixer.cpp b/src/audio/sink_mixer.cpp index 5a5a8616..6c72c8b0 100644 --- a/src/audio/sink_mixer.cpp +++ b/src/audio/sink_mixer.cpp @@ -38,7 +38,10 @@ SinkMixer::SinkMixer(StreamBufferHandle_t dest) input_stream_.reset(new RawStream(kSampleBufferLength)); resampled_stream_.reset(new RawStream(kSampleBufferLength)); - tasks::StartPersistent(1, [&]() { Main(); }); + // Pin to CORE0 
because we need the FPU. + // FIXME: A fixed point implementation could run freely on either core, + // which should lead to a big performance increase. + tasks::StartPersistent(0, [&]() { Main(); }); } SinkMixer::~SinkMixer() {