I want to write a C++ library named cppdub that mimics the Python module pydub.
One of its main functions exports an AudioSegment to a file in a specific format (for example, mp3).
The code is:
AudioSegment AudioSegment::from_file(const std::string& file_path, const std::string& format, const std::string& codec,
const std::map<std::string, int>& parameters, int start_second, int duration) {
avformat_network_init();
av_log_set_level(AV_LOG_ERROR); // Adjust logging level as needed
AVFormatContext* format_ctx = nullptr;
if (avformat_open_input(&format_ctx, file_path.c_str(), nullptr, nullptr) != 0) {
std::cerr << "Error: Could not open audio file." << std::endl;
return AudioSegment(); // Return an empty AudioSegment on failure
}
if (avformat_find_stream_info(format_ctx, nullptr) < 0) {
std::cerr << "Error: Could not find stream information." << std::endl;
avformat_close_input(&format_ctx);
return AudioSegment();
}
int audio_stream_index = -1;
for (unsigned int i = 0; i < format_ctx->nb_streams; i++) {
if (format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
audio_stream_index = i;
break;
}
}
if (audio_stream_index == -1) {
std::cerr << "Error: Could not find audio stream." << std::endl;
avformat_close_input(&format_ctx);
return AudioSegment();
}
AVCodecParameters* codec_par = format_ctx->streams[audio_stream_index]->codecpar;
const AVCodec* my_codec = avcodec_find_decoder(codec_par->codec_id);
AVCodecContext* codec_ctx = avcodec_alloc_context3(my_codec);
if (!codec_ctx) {
std::cerr << "Error: Could not allocate codec context." << std::endl;
avformat_close_input(&format_ctx);
return AudioSegment();
}
if (avcodec_parameters_to_context(codec_ctx, codec_par) < 0) {
std::cerr << "Error: Could not initialize codec context." << std::endl;
avcodec_free_context(&codec_ctx);
avformat_close_input(&format_ctx);
return AudioSegment();
}
if (avcodec_open2(codec_ctx, my_codec, nullptr) < 0) {
std::cerr << "Error: Could not open codec." << std::endl;
avcodec_free_context(&codec_ctx);
avformat_close_input(&format_ctx);
return AudioSegment();
}
SwrContext* swr_ctx = swr_alloc();
if (!swr_ctx) {
std::cerr << "Error: Could not allocate SwrContext." << std::endl;
avcodec_free_context(&codec_ctx);
avformat_close_input(&format_ctx);
return AudioSegment();
}
codec_ctx->sample_rate = 44100;
// Set up resampling context to convert to S16 format with 2 bytes per sample
av_opt_set_chlayout(swr_ctx, "in_chlayout", &codec_ctx->ch_layout, 0);
av_opt_set_int(swr_ctx, "in_sample_rate", codec_ctx->sample_rate, 0);
av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", codec_ctx->sample_fmt, 0);
AVChannelLayout dst_ch_layout;
av_channel_layout_copy(&dst_ch_layout, &codec_ctx->ch_layout);
av_channel_layout_uninit(&dst_ch_layout);
av_channel_layout_default(&dst_ch_layout, 2);
av_opt_set_chlayout(swr_ctx, "out_chlayout", &dst_ch_layout, 0);
av_opt_set_int(swr_ctx, "out_sample_rate", codec_ctx->sample_rate, 0); // Match input sample rate
av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0); // Force S16 format
if (swr_init(swr_ctx) < 0) {
std::cerr << "Error: Failed to initialize the resampling context" << std::endl;
swr_free(&swr_ctx);
avcodec_free_context(&codec_ctx);
avformat_close_input(&format_ctx);
return AudioSegment();
}
AVPacket packet;
AVFrame* frame = av_frame_alloc();
if (!frame) {
std::cerr << "Error: Could not allocate frame." << std::endl;
swr_free(&swr_ctx);
avcodec_free_context(&codec_ctx);
avformat_close_input(&format_ctx);
return AudioSegment();
}
std::vector<char> output;
while (av_read_frame(format_ctx, &packet) >= 0) {
if (packet.stream_index == audio_stream_index) {
if (avcodec_send_packet(codec_ctx, &packet) == 0) {
while (avcodec_receive_frame(codec_ctx, frame) == 0) {
if (frame->pts != AV_NOPTS_VALUE) {
frame->pts = av_rescale_q(frame->pts, codec_ctx->time_base, format_ctx->streams[audio_stream_index]->time_base);
}
uint8_t* output_buffer;
int output_samples = av_rescale_rnd(
swr_get_delay(swr_ctx, codec_ctx->sample_rate) + frame->nb_samples,
codec_ctx->sample_rate, codec_ctx->sample_rate, AV_ROUND_UP);
int output_buffer_size = av_samples_get_buffer_size(
nullptr, 2, output_samples, AV_SAMPLE_FMT_S16, 1);
output_buffer = (uint8_t*)av_malloc(output_buffer_size);
if (output_buffer) {
memset(output_buffer, 0, output_buffer_size); // Zero padding to avoid random noise
int converted_samples = swr_convert(swr_ctx, &output_buffer, output_samples,
(const uint8_t**)frame->extended_data, frame->nb_samples);
if (converted_samples >= 0) {
output.insert(output.end(), output_buffer, output_buffer + output_buffer_size);
}
else {
std::cerr << "Error: Failed to convert audio samples." << std::endl;
}
// Make sure output_buffer is valid before freeing
if (output_buffer != nullptr) {
av_free(output_buffer);
output_buffer = nullptr; // Prevent double-free
}
}
else {
std::cerr << "Error: Could not allocate output buffer." << std::endl;
}
}
}
else {
std::cerr << "Error: Failed to send packet to codec context." << std::endl;
}
}
av_packet_unref(&packet);
}
int frame_width = av_get_bytes_per_sample(AV_SAMPLE_FMT_S16) * 2; // Use 2 bytes per sample and 2 channels
std::map<std::string, int> metadata = {
{"sample_width", 2}, // S16 format has 2 bytes per sample
{"frame_rate", codec_ctx->sample_rate}, // Use the input sample rate
{"channels", 2}, // Assuming stereo output
{"frame_width", frame_width}
};
av_frame_free(&frame);
swr_free(&swr_ctx);
avcodec_free_context(&codec_ctx);
avformat_close_input(&format_ctx);
return AudioSegment(static_cast<const char*>(output.data()), output.size(), metadata);
}
std::ofstream AudioSegment::export_segment(const std::string& out_f,
const std::string& format,
const std::string& codec,
const std::string& bitrate,
const std::vector<std::string>& parameters,
const std::map<std::string, std::string>& tags,
const std::string& id3v2_version,
const std::string& cover) {
av_log_set_level(AV_LOG_DEBUG);
AVCodecContext* codec_ctx = nullptr;
AVFormatContext* format_ctx = nullptr;
AVStream* stream = nullptr;
AVFrame* frame = nullptr;
AVPacket* pkt = nullptr;
SwrContext* swr_ctx = nullptr;
int ret;
// Initialize format context
if (avformat_alloc_output_context2(&format_ctx, nullptr, format.c_str(), out_f.c_str()) < 0) {
throw std::runtime_error("Could not allocate format context.");
}
// Find encoder
const AVCodec* codec_ptr = avcodec_find_encoder_by_name(codec.c_str());
if (!codec_ptr) {
throw std::runtime_error("Codec not found.");
}
// Add stream
stream = avformat_new_stream(format_ctx, codec_ptr);
if (!stream) {
throw std::runtime_error("Failed to create new stream.");
}
// Allocate codec context
codec_ctx = avcodec_alloc_context3(codec_ptr);
if (!codec_ctx) {
throw std::runtime_error("Could not allocate audio codec context.");
}
// Set codec parameters
codec_ctx->bit_rate = std::stoi(bitrate);
codec_ctx->sample_rate = this->get_frame_rate(); // Ensure this returns the correct sample rate
av_channel_layout_default(&codec_ctx->ch_layout, 2);
codec_ctx->sample_fmt = codec_ptr->sample_fmts ? codec_ptr->sample_fmts[0] : AV_SAMPLE_FMT_FLTP;
// Open codec
if (avcodec_open2(codec_ctx, codec_ptr, nullptr) < 0) {
throw std::runtime_error("Could not open codec.");
}
// Set codec parameters to the stream
if (avcodec_parameters_from_context(stream->codecpar, codec_ctx) < 0) {
throw std::runtime_error("Could not initialize stream codec parameters.");
}
// Open output file
std::ofstream out_file(out_f, std::ios::binary);
if (!out_file) {
throw std::runtime_error("Failed to open output file.");
}
if (!(format_ctx->oformat->flags & AVFMT_NOFILE)) {
if (avio_open(&format_ctx->pb, out_f.c_str(), AVIO_FLAG_WRITE) < 0) {
throw std::runtime_error("Could not open output file.");
}
}
// Write file header
if (avformat_write_header(format_ctx, nullptr) < 0) {
throw std::runtime_error("Error occurred when opening output file.");
}
// Initialize packet
pkt = av_packet_alloc();
if (!pkt) {
throw std::runtime_error("Could not allocate AVPacket.");
}
// Initialize frame
frame = av_frame_alloc();
if (!frame) {
throw std::runtime_error("Could not allocate AVFrame.");
}
frame->nb_samples = codec_ctx->frame_size;
frame->format = codec_ctx->sample_fmt;
frame->ch_layout = codec_ctx->ch_layout;
// Allocate data buffer
if (av_frame_get_buffer(frame, 0) < 0) {
throw std::runtime_error("Could not allocate audio data buffers.");
}
// Initialize SwrContext for resampling
swr_ctx = swr_alloc();
if (!swr_ctx) {
throw std::runtime_error("Could not allocate SwrContext.");
}
// Set options for resampling
av_opt_set_chlayout(swr_ctx, "in_chlayout", &codec_ctx->ch_layout, 0);
av_opt_set_chlayout(swr_ctx, "out_chlayout", &codec_ctx->ch_layout, 0);
av_opt_set_int(swr_ctx, "in_sample_rate", codec_ctx->sample_rate, 0);
av_opt_set_int(swr_ctx, "out_sample_rate", codec_ctx->sample_rate, 0);
av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", AV_SAMPLE_FMT_S16, 0); // Assuming input is S16
av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", codec_ctx->sample_fmt, 0);
// Initialize the resampling context
if (swr_init(swr_ctx) < 0) {
throw std::runtime_error("Failed to initialize SwrContext.");
}
int samples_read = 0;
int total_samples = data_.size() / (av_get_bytes_per_sample(AV_SAMPLE_FMT_S16) * 2); // Assuming input is stereo
while (samples_read < total_samples) {
if (av_frame_make_writable(frame) < 0) {
throw std::runtime_error("Frame not writable.");
}
int num_samples = std::min(codec_ctx->frame_size, total_samples - samples_read);
// Prepare input data
const uint8_t* input_data[2] = { reinterpret_cast<const uint8_t*>(data_.data() + samples_read * av_get_bytes_per_sample(AV_SAMPLE_FMT_S16) * 2), nullptr };
int output_samples = swr_convert(swr_ctx, frame->data, frame->nb_samples,
input_data, num_samples);
if (output_samples < 0) {
throw std::runtime_error("Error converting audio samples.");
}
frame->nb_samples = output_samples;
// Send the frame for encoding
if (avcodec_send_frame(codec_ctx, frame) < 0) {
throw std::runtime_error("Error sending frame for encoding.");
}
// Receive and write packets
while (avcodec_receive_packet(codec_ctx, pkt) >= 0) {
out_file.write(reinterpret_cast<char*>(pkt->data), pkt->size);
av_packet_unref(pkt);
}
samples_read += num_samples;
}
// Flush the encoder
if (avcodec_send_frame(codec_ctx, nullptr) < 0) {
throw std::runtime_error("Error flushing the encoder.");
}
while (avcodec_receive_packet(codec_ctx, pkt) >= 0) {
out_file.write(reinterpret_cast<char*>(pkt->data), pkt->size);
av_packet_unref(pkt);
}
// Write file trailer
av_write_trailer(format_ctx);
// Cleanup
av_frame_free(&frame);
av_packet_free(&pkt);
swr_free(&swr_ctx);
avcodec_free_context(&codec_ctx);
if (!(format_ctx->oformat->flags & AVFMT_NOFILE)) {
avio_closep(&format_ctx->pb);
}
avformat_free_context(format_ctx);
out_file.close();
return out_file;
}
//declaration
/*
std::ofstream export_segment(const std::string& out_f,
const std::string& format = "mp3",
const std::string& codec = "libmp3lame",
const std::string& bitrate = "128000",
const std::vector<std::string>& parameters = {},
const std::map<std::string, std::string>& tags = {},
const std::string& id3v2_version = "4",
const std::string& cover = "");
*/
This code only works for the mp3 format. I also want to export to aac, ogg, flv, wav, and other popular formats.
1