/**
 * Decode the first audio stream of a media file into interleaved signed
 * 16-bit stereo PCM at 44100 Hz and wrap it in an AudioSegment.
 *
 * The decoded audio is resampled from whatever rate, sample format and
 * channel layout the decoder produces. The resampler is configured from the
 * first decoded frame (and reconfigured if those properties change), because
 * the values reported before decoding are not always the ones the decoder
 * ends up emitting.
 *
 * @param file_path    Path or URL of the media file to open.
 * @param format       Currently ignored.
 * @param codec        Currently ignored.
 * @param parameters   Currently ignored.
 * @param start_second Currently ignored.
 * @param duration     Currently ignored.
 * @return An AudioSegment built from the decoded bytes and a metadata map with
 *         "sample_width" (2), "frame_rate" (44100), "channels" (2) and
 *         "frame_width" (4). A default-constructed AudioSegment is returned
 *         when the file cannot be opened or the decoder/resampler cannot be
 *         set up; a message is written to std::cerr in that case.
 */
AudioSegment AudioSegment::from_file(const std::string& file_path, const std::string& format, const std::string& codec,
                                     const std::map<std::string, int>& parameters, int start_second, int duration) {
    // Accepted for interface compatibility only.
    (void)format;
    (void)codec;
    (void)parameters;
    (void)start_second;
    (void)duration;

    constexpr int kOutChannels = 2;
    constexpr int kOutSampleRate = 44100;
    constexpr AVSampleFormat kOutSampleFmt = AV_SAMPLE_FMT_S16;

    avformat_network_init();
    av_log_set_level(AV_LOG_ERROR);

    AVFormatContext* format_ctx = nullptr;
    AVCodecContext* codec_ctx = nullptr;
    SwrContext* swr_ctx = nullptr;
    AVPacket* packet = nullptr;
    AVFrame* frame = nullptr;

    // Single cleanup path; every FFmpeg release function below tolerates null.
    auto release_all = [&]() {
        av_frame_free(&frame);
        av_packet_free(&packet);
        swr_free(&swr_ctx);
        avcodec_free_context(&codec_ctx);
        avformat_close_input(&format_ctx);
    };
    auto fail = [&](const char* message) {
        std::cerr << message << std::endl;
        release_all();
        return AudioSegment();
    };

    if (avformat_open_input(&format_ctx, file_path.c_str(), nullptr, nullptr) != 0) {
        return fail("Error: Could not open audio file.");
    }
    if (avformat_find_stream_info(format_ctx, nullptr) < 0) {
        return fail("Error: Could not find stream information.");
    }

    int audio_stream_index = -1;
    for (unsigned int i = 0; i < format_ctx->nb_streams; ++i) {
        if (format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            audio_stream_index = static_cast<int>(i);
            break;
        }
    }
    if (audio_stream_index == -1) {
        return fail("Error: Could not find audio stream.");
    }

    const AVCodecParameters* codec_par = format_ctx->streams[audio_stream_index]->codecpar;
    const AVCodec* decoder = avcodec_find_decoder(codec_par->codec_id);
    if (!decoder) {
        return fail("Error: Could not find a decoder for the audio stream.");
    }
    codec_ctx = avcodec_alloc_context3(decoder);
    if (!codec_ctx) {
        return fail("Error: Could not allocate codec context.");
    }
    if (avcodec_parameters_to_context(codec_ctx, codec_par) < 0) {
        return fail("Error: Could not initialize codec context.");
    }
    if (avcodec_open2(codec_ctx, decoder, nullptr) < 0) {
        return fail("Error: Could not open codec.");
    }

    swr_ctx = swr_alloc();
    if (!swr_ctx) {
        return fail("Error: Could not allocate SwrContext.");
    }
    packet = av_packet_alloc();
    if (!packet) {
        return fail("Error: Could not allocate packet.");
    }
    frame = av_frame_alloc();
    if (!frame) {
        return fail("Error: Could not allocate frame.");
    }

    // Zero-initialised before use; the default stereo layout owns no heap memory.
    AVChannelLayout out_layout = {};
    av_channel_layout_default(&out_layout, kOutChannels);

    std::vector<char> output;
    bool resampler_ready = false;
    bool resampler_failed = false;
    int in_rate = 0;
    int in_format = AV_SAMPLE_FMT_NONE;
    int in_channels = 0;

    // Runs `input_samples` samples through the resampler and appends the result
    // to `output`. Passing (nullptr, 0) drains whatever the resampler buffered.
    auto append_converted = [&](const uint8_t** input, int input_samples) {
        if (!resampler_ready) {
            return;
        }
        const int64_t pending = swr_get_delay(swr_ctx, in_rate) + input_samples;
        const int capacity = static_cast<int>(av_rescale_rnd(pending, kOutSampleRate, in_rate, AV_ROUND_UP));
        if (capacity <= 0) {
            return;
        }
        const int buffer_size = av_samples_get_buffer_size(nullptr, kOutChannels, capacity, kOutSampleFmt, 1);
        uint8_t* buffer = buffer_size > 0 ? static_cast<uint8_t*>(av_malloc(buffer_size)) : nullptr;
        if (!buffer) {
            std::cerr << "Error: Could not allocate output buffer." << std::endl;
            return;
        }
        const int converted = swr_convert(swr_ctx, &buffer, capacity, input, input_samples);
        if (converted > 0) {
            // Only a positive count has a valid byte size; zero means the
            // resampler kept the samples for a later call.
            const int bytes = av_samples_get_buffer_size(nullptr, kOutChannels, converted, kOutSampleFmt, 1);
            output.insert(output.end(), buffer, buffer + bytes);
        } else if (converted < 0) {
            std::cerr << "Error: Failed to convert audio samples." << std::endl;
        }
        av_free(buffer);
    };

    // (Re)initialises the resampler so that its input side matches `decoded`.
    auto prepare_resampler = [&](const AVFrame* decoded) -> bool {
        const int rate = decoded->sample_rate > 0 ? decoded->sample_rate : kOutSampleRate;
        if (resampler_ready && rate == in_rate && decoded->format == in_format &&
            decoded->ch_layout.nb_channels == in_channels) {
            return true;
        }
        if (resampler_ready) {
            // Input properties changed mid-stream: flush what was buffered under
            // the old configuration before reconfiguring.
            append_converted(nullptr, 0);
            swr_close(swr_ctx);
            resampler_ready = false;
        }
        av_opt_set_chlayout(swr_ctx, "in_chlayout", &decoded->ch_layout, 0);
        av_opt_set_int(swr_ctx, "in_sample_rate", rate, 0);
        av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", static_cast<AVSampleFormat>(decoded->format), 0);
        av_opt_set_chlayout(swr_ctx, "out_chlayout", &out_layout, 0);
        av_opt_set_int(swr_ctx, "out_sample_rate", kOutSampleRate, 0);
        av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", kOutSampleFmt, 0);
        if (swr_init(swr_ctx) < 0) {
            std::cerr << "Error: Failed to initialize the resampling context" << std::endl;
            return false;
        }
        in_rate = rate;
        in_format = decoded->format;
        in_channels = decoded->ch_layout.nb_channels;
        resampler_ready = true;
        return true;
    };

    // Pulls every frame the decoder currently has available and converts it.
    auto drain_decoder = [&]() {
        while (!resampler_failed) {
            const int ret = avcodec_receive_frame(codec_ctx, frame);
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                return;
            }
            if (ret < 0) {
                std::cerr << "Error: Failed to decode audio frame (code: " << ret << ")." << std::endl;
                return;
            }
            if (frame->nb_samples > 0) {
                if (prepare_resampler(frame)) {
                    append_converted(const_cast<const uint8_t**>(frame->extended_data), frame->nb_samples);
                } else {
                    resampler_failed = true;
                }
            }
            av_frame_unref(frame);
        }
    };

    while (!resampler_failed && av_read_frame(format_ctx, packet) >= 0) {
        if (packet->stream_index == audio_stream_index) {
            if (avcodec_send_packet(codec_ctx, packet) == 0) {
                drain_decoder();
            } else {
                std::cerr << "Error: Failed to send packet to codec context." << std::endl;
            }
        }
        av_packet_unref(packet);
    }

    // A null packet puts the decoder in draining mode so that frames it is still
    // holding back are returned; afterwards the resampler is drained as well.
    if (!resampler_failed && avcodec_send_packet(codec_ctx, nullptr) == 0) {
        drain_decoder();
    }
    if (resampler_failed) {
        release_all();
        return AudioSegment();
    }
    append_converted(nullptr, 0);

    const int frame_width = av_get_bytes_per_sample(kOutSampleFmt) * kOutChannels;
    std::map<std::string, int> metadata = {
        {"sample_width", 2},
        {"frame_rate", kOutSampleRate},
        {"channels", kOutChannels},
        {"frame_width", frame_width}
    };

    release_all();
    return AudioSegment(static_cast<const char*>(output.data()), output.size(), metadata);
}
void log_ffmpeg_error(int ret) {
char errbuf[AV_ERROR_MAX_STRING_SIZE];
av_make_error_string(errbuf, AV_ERROR_MAX_STRING_SIZE, ret);
std::cerr << "FFmpeg error: " << errbuf << std::endl;
}
std::ofstream AudioSegment::export_segment_to_ogg(const std::string& out_f) {
av_log_set_level(AV_LOG_DEBUG);
AVFormatContext* format_ctx = nullptr;
AVCodecContext* codec_ctx = nullptr;
AVStream* stream = nullptr;
const AVCodec* codec = nullptr;
AVPacket* pkt = av_packet_alloc();
AVFrame* frame = av_frame_alloc();
int ret;
if (!pkt || !frame) {
std::cerr << "Error: Could not allocate packet or frame." << std::endl;
return std::ofstream();
}
std::ofstream out_file(out_f, std::ios::binary);
if (!out_file.is_open()) {
std::cerr << "Error: Could not open output file." << std::endl;
return std::ofstream();
}
ret = avformat_alloc_output_context2(&format_ctx, nullptr, "ogg", out_f.c_str());
if (ret < 0 || !format_ctx) {
std::cerr << "Error: Could not allocate output format context." << std::endl;
log_ffmpeg_error(ret);
return std::ofstream();
}
codec = avcodec_find_encoder_by_name("libvorbis");
if (!codec) {
std::cerr << "Error: Could not find Vorbis codec." << std::endl;
avformat_free_context(format_ctx);
return std::ofstream();
}
stream = avformat_new_stream(format_ctx, codec);
if (!stream) {
std::cerr << "Error: Could not create new stream." << std::endl;
avformat_free_context(format_ctx);
return std::ofstream();
}
codec_ctx = avcodec_alloc_context3(codec);
if (!codec_ctx) {
std::cerr << "Error: Could not allocate codec context." << std::endl;
avformat_free_context(format_ctx);
return std::ofstream();
}
codec_ctx->sample_rate = 48000;
codec_ctx->ch_layout = AV_CHANNEL_LAYOUT_STEREO;
codec_ctx->bit_rate = 128000;
codec_ctx->sample_fmt = codec->sample_fmts ? codec->sample_fmts[0] : AV_SAMPLE_FMT_FLTP;
codec_ctx->time_base = { 1, codec_ctx->sample_rate };
stream->time_base = codec_ctx->time_base;
ret = avcodec_parameters_from_context(stream->codecpar, codec_ctx);
if (ret < 0) {
std::cerr << "Error: Could not copy codec parameters to stream." << std::endl;
log_ffmpeg_error(ret);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
ret = avcodec_open2(codec_ctx, codec, nullptr);
if (ret < 0) {
std::cerr << "Error: Could not open codec." << std::endl;
log_ffmpeg_error(ret);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
if (!(format_ctx->oformat->flags & AVFMT_NOFILE)) {
ret = avio_open(&format_ctx->pb, out_f.c_str(), AVIO_FLAG_WRITE);
if (ret < 0) {
std::cerr << "Error: Could not open file for writing." << std::endl;
log_ffmpeg_error(ret);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
}
ret = avformat_write_header(format_ctx, nullptr);
if (ret < 0) {
std::cerr << "Error: Could not write file header." << std::endl;
log_ffmpeg_error(ret);
avio_closep(&format_ctx->pb);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
int total_samples = data_.size() / (av_get_bytes_per_sample(AV_SAMPLE_FMT_S16) * codec_ctx->ch_layout.nb_channels);
int samples_read = 0;
codec_ctx->frame_size = 1024;
frame->nb_samples = codec_ctx->frame_size;
while (samples_read < total_samples) {
int num_samples = std::min(codec_ctx->frame_size, total_samples - samples_read);
frame->nb_samples = num_samples;
ret = av_frame_get_buffer(frame, 0);
if (ret < 0) {
std::cerr << "Error: Could not allocate audio data buffers." << std::endl;
log_ffmpeg_error(ret);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
std::memcpy(frame->data[0],
data_.data() + samples_read * av_get_bytes_per_sample(AV_SAMPLE_FMT_S16) * codec_ctx->ch_layout.nb_channels,
num_samples * av_get_bytes_per_sample(AV_SAMPLE_FMT_S16) * codec_ctx->ch_layout.nb_channels);
if (num_samples < codec_ctx->frame_size) {
int padding_size = codec_ctx->frame_size - num_samples;
std::memset(frame->data[0] + num_samples * av_get_bytes_per_sample(AV_SAMPLE_FMT_S16) * codec_ctx->ch_layout.nb_channels,
0,
padding_size * av_get_bytes_per_sample(AV_SAMPLE_FMT_S16) * codec_ctx->ch_layout.nb_channels);
frame->nb_samples = codec_ctx->frame_size;
}
frame->pts = av_rescale_q(samples_read, { 1, codec_ctx->sample_rate }, stream->time_base);
ret = avcodec_send_frame(codec_ctx, frame);
if (ret < 0) {
std::cerr << "Error: Error sending frame for encoding." << std::endl;
log_ffmpeg_error(ret);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
while (ret >= 0) {
ret = avcodec_receive_packet(codec_ctx, pkt);
if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
break;
}
else if (ret < 0) {
std::cerr << "Error: Error receiving packet." << std::endl;
log_ffmpeg_error(ret);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
pkt->pts = pkt->dts = av_rescale_q(pkt->pts, codec_ctx->time_base, stream->time_base);
pkt->stream_index = stream->index;
if (av_interleaved_write_frame(format_ctx, pkt) < 0) {
std::cerr << "Error: Error writing packet." << std::endl;
log_ffmpeg_error(ret);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
av_packet_unref(pkt);
}
samples_read += num_samples;
}
avcodec_send_frame(codec_ctx, nullptr);
while (avcodec_receive_packet(codec_ctx, pkt) == 0) {
pkt->pts = pkt->dts = av_rescale_q(pkt->pts, codec_ctx->time_base, stream->time_base);
pkt->stream_index = stream->index;
if (av_interleaved_write_frame(format_ctx, pkt) < 0) {
std::cerr << "Error: Error writing packet." << std::endl;
log_ffmpeg_error(ret);
avcodec_free_context(&codec_ctx);
avformat_free_context(format_ctx);
return std::ofstream();
}
av_packet_unref(pkt);
}
av_write_trailer(format_ctx);
av_frame_free(&frame);
av_packet_free(&pkt);
avcodec_free_context(&codec_ctx);
if (format_ctx->pb) {
avio_closep(&format_ctx->pb);
}
avformat_free_context(format_ctx);
return out_file;
}
// Errors observed at runtime when exporting to Ogg:
//   [file @ 000001b3b32640c0] Setting default whitelist 'file,crypto,data'
//   [ogg @ 000001b3b31d4a00] No extradata present
//   Error: Could not write file header.
//   FFmpeg error: Invalid data found when processing input
//   [AVIOContext @ 000001b3b4aa3d80] Statistics: 0 bytes written, 0 seeks, 0 writeouts