Update buffer size and overlap size in whisper-processing.h and default buffer size in msec in transcription-filter.cpp #95

Merged
merged 4 commits on May 2, 2024
Update buffer size and overlap size in whisper-processing.h and default buffer size in msec in transcription-filter.cpp
royshil committed Apr 30, 2024
commit 4f38b7e2e11b52da0203a4e456007586a29d43f5
19 changes: 18 additions & 1 deletion src/transcription-filter.cpp
@@ -189,7 +189,24 @@ void set_text_callback(struct transcription_filter_data *gf,
std::string str_copy = fix_utf8(result.text);
str_copy = remove_leading_trailing_nonalpha(str_copy);

if (gf->translate) {
// if suppression is enabled, check if the text is in the suppression list
if (!gf->suppress_sentences.empty()) {
// split the suppression list by newline into individual sentences
std::vector<std::string> suppress_sentences_list =
split(gf->suppress_sentences, '\n');
// check if the text is in the suppression list
for (const std::string &suppress_sentence : suppress_sentences_list) {
if (str_copy == suppress_sentence) {
obs_log(gf->log_level, "Suppressed sentence: '%s'",
str_copy.c_str());
gf->last_text = str_copy;
return; // do not process the sentence
}
}
}

if (gf->translate && !str_copy.empty() && str_copy != gf->last_text &&
result.result == DETECTION_RESULT_SPEECH) {
obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(),
gf->target_lang.c_str());
std::string translated_text;
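
For readers outside the plugin codebase, here is a minimal, self-contained sketch of the exact-match suppression check this hunk adds to `set_text_callback`. The `split` helper is re-implemented here purely for illustration, as an assumption about what the plugin's utility of the same name does.

```cpp
// Sketch only: mirrors the suppression logic added above, outside the OBS plugin context.
#include <sstream>
#include <string>
#include <vector>

// assumed behavior of the plugin's split() helper: break a string on a delimiter
static std::vector<std::string> split(const std::string &s, char delim)
{
	std::vector<std::string> parts;
	std::stringstream ss(s);
	std::string item;
	while (std::getline(ss, item, delim)) {
		parts.push_back(item);
	}
	return parts;
}

// true if `text` exactly matches one of the newline-separated suppression sentences
static bool is_suppressed(const std::string &text, const std::string &suppress_sentences)
{
	if (suppress_sentences.empty()) {
		return false;
	}
	for (const std::string &sentence : split(suppress_sentences, '\n')) {
		if (text == sentence) {
			return true;
		}
	}
	return false;
}
```

Note that the relocated check is now an exact comparison (`==`) against each suppression entry, whereas the block removed from `run_whisper_inference` below used a substring match via `find`.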
121 changes: 52 additions & 69 deletions src/whisper-utils/whisper-processing.cpp
@@ -287,15 +287,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
if (token.id >= 50256) {
keep = false;
}
if ((j == n_tokens - 2 || j == n_tokens - 3) && token.p < 0.5) {
if (j == n_tokens - 2 && token.p < 0.5) {
keep = false;
}
if (j == n_tokens - 3 && token.p < 0.4) {
keep = false;
}
// if the second to last token is .id == 13 ('.'), don't keep it
if (j == n_tokens - 2 && token.id == 13) {
keep = false;
}
// token ids https://huggingface.co/openai/whisper-large-v3/raw/main/tokenizer.json
if (token.id > 50540 && token.id <= 51865) {
if (token.id > 50566 && token.id <= 51865) {
obs_log(gf->log_level,
"Large time token found (%d), this shouldn't happen",
token.id);
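
As a reading aid, the per-token heuristics touched by this hunk can be summarized in a small standalone predicate. `TokenLite` and `keep_token` are illustrative stand-ins, not whisper.cpp API; only the conditions themselves are taken from the diff, and in the plugin the large-time-token branch additionally logs a warning.

```cpp
// Sketch of the token filtering conditions after this change.
#include <cstdint>

struct TokenLite {
	int32_t id; // whisper vocabulary id
	float p;    // token probability
};

// Decide whether token j (out of n_tokens) is kept in the decoded sentence.
static bool keep_token(const TokenLite &token, int j, int n_tokens)
{
	bool keep = true;
	if (token.id >= 50256) // special / timestamp tokens
		keep = false;
	if (j == n_tokens - 2 && token.p < 0.5f) // low-confidence second-to-last token
		keep = false;
	if (j == n_tokens - 3 && token.p < 0.4f) // low-confidence third-to-last token
		keep = false;
	if (j == n_tokens - 2 && token.id == 13) // '.' right before the last token
		keep = false;
	if (token.id > 50566 && token.id <= 51865) // unexpectedly large time token
		keep = false;
	return keep;
}
```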
@@ -315,20 +318,6 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
obs_log(gf->log_level, "Decoded sentence: '%s'", text.c_str());
obs_log(gf->log_level, "Token IDs: %s", tokenIds.c_str());

// if suppression is enabled, check if the text is in the suppression list
if (!gf->suppress_sentences.empty()) {
// split the suppression list by newline into individual sentences
std::vector<std::string> suppress_sentences_list =
split(gf->suppress_sentences, '\n');
// check if the text is in the suppression list
for (const std::string &suppress_sentence : suppress_sentences_list) {
if (text.find(suppress_sentence) != std::string::npos) {
obs_log(gf->log_level, "Suppressed sentence: '%s'",
text.c_str());
return {DETECTION_RESULT_UNKNOWN, "", 0, 0, {}};
}
}
}
if (gf->log_words) {
obs_log(LOG_INFO, "[%s --> %s] (%.3f) %s", to_timestamp(t0).c_str(),
to_timestamp(t1).c_str(), sentence_p, text.c_str());
@@ -346,7 +335,7 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
{
uint32_t num_new_frames_from_infos = 0;
uint64_t start_timestamp = 0;
bool last_step_in_segment = false;
bool save_overlap_region = true;

{
// scoped lock the buffer mutex
@@ -355,6 +344,10 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
// We need (gf->frames - gf->last_num_frames) new frames for a full segment,
const size_t remaining_frames_to_full_segment = gf->frames - gf->last_num_frames;

obs_log(gf->log_level,
"processing audio from buffer, %lu existing frames, %lu frames needed to full segment (%d frames)",
gf->last_num_frames, remaining_frames_to_full_segment, gf->frames);

// pop infos from the info buffer and mark the beginning timestamp from the first
// info as the beginning timestamp of the segment
struct transcription_filter_audio_info info_from_buf = {0};
@@ -371,14 +364,12 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
num_new_frames_from_infos -= info_from_buf.frames;
circlebuf_push_front(&gf->info_buffer, &info_from_buf,
size_of_audio_info);
// this is the final step in the segment
last_step_in_segment = true;
break;
}
}

obs_log(gf->log_level,
"with %lu remaining to full segment, popped %d info-frames, pushing at %lu (overlap)",
"with %lu remaining to full segment, popped %d frames from info buffer, pushed at %lu (overlap)",
remaining_frames_to_full_segment, num_new_frames_from_infos,
gf->last_num_frames);

@@ -392,24 +383,17 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
}

if (gf->last_num_frames > 0) {
obs_log(gf->log_level, "full segment, %lu frames overlap, %lu frames to process",
gf->last_num_frames, gf->last_num_frames + num_new_frames_from_infos);
gf->last_num_frames += num_new_frames_from_infos;
if (!last_step_in_segment) {
// Mid-segment process
obs_log(gf->log_level, "mid-segment, now %d frames left to full segment",
(int)(gf->frames - gf->last_num_frames));
} else {
// Final step in segment
obs_log(gf->log_level, "full segment, %d frames to process",
(int)(gf->last_num_frames));
}
} else {
gf->last_num_frames = num_new_frames_from_infos;
obs_log(gf->log_level, "first segment, no overlap exists, %d frames to process",
(int)(gf->last_num_frames));
obs_log(gf->log_level, "first segment, no overlap exists, %lu frames to process",
gf->last_num_frames);
}

obs_log(gf->log_level, "processing %d frames (%d ms), start timestamp %llu ",
(int)gf->last_num_frames, (int)(gf->last_num_frames * 1000 / gf->sample_rate),
obs_log(gf->log_level, "processing %lu frames (%d ms), start timestamp %llu",
gf->last_num_frames, (int)(gf->last_num_frames * 1000.0f / gf->sample_rate),
start_timestamp);

// time the audio processing
@@ -442,44 +426,38 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
resampled_16khz_frames);
skipped_inference = true;
// prevent copying the buffer to the beginning (overlap)
gf->last_num_frames = 0;
last_step_in_segment = false;
save_overlap_region = false;
} else {
speech_start_frame = (stamps[0].start < 3000) ? 0 : stamps[0].start;
// if the vad finds that start within the first 10% of the buffer, set the start to 0
speech_start_frame = (stamps[0].start < (int)(resampled_16khz_frames / 10))
? 0
: stamps[0].start;
speech_end_frame = stamps.back().end;
uint32_t number_of_frames = speech_end_frame - speech_start_frame;

// if the speech is pressed up against the end of the buffer
// apply the overlapped region, else don't
save_overlap_region = (speech_end_frame == resampled_16khz_frames);

obs_log(gf->log_level,
"VAD detected speech from %d to %d (%d frames, %d ms)",
speech_start_frame, speech_end_frame, number_of_frames,
number_of_frames * 1000 / WHISPER_SAMPLE_RATE);

// if the speech segment is less than 1 second - put the audio back into the buffer
// to be handled in the next iteration
// if the speech is less than 1 second - pad with zeros and send for inference
if (number_of_frames > 0 && number_of_frames < WHISPER_SAMPLE_RATE) {
// convert speech_start_frame and speech_end_frame to original sample rate
speech_start_frame =
speech_start_frame * gf->sample_rate / WHISPER_SAMPLE_RATE;
speech_end_frame =
speech_end_frame * gf->sample_rate / WHISPER_SAMPLE_RATE;
number_of_frames = speech_end_frame - speech_start_frame;

// use memmove to copy the speech segment to the beginning of the buffer
for (size_t c = 0; c < gf->channels; c++) {
memmove(gf->copy_buffers[c],
gf->copy_buffers[c] + speech_start_frame,
number_of_frames * sizeof(float));
}

obs_log(gf->log_level,
"Speech segment is less than 1 second, moving %d to %d (len %d) to buffer start",
speech_start_frame, speech_end_frame, number_of_frames);
// no processing of the segment
skipped_inference = true;
// reset the last_num_frames to the number of frames in the buffer
gf->last_num_frames = number_of_frames;
// prevent copying the buffer to the beginning (overlap)
last_step_in_segment = false;
"Speech segment is less than 1 second, padding with zeros to 1 second");
// copy the speech segment to the beginning of the resampled buffer
// use memmove to copy the speech segment to the beginning of the buffer
memmove(resampled_16khz[0], resampled_16khz[0] + speech_start_frame,
number_of_frames * sizeof(float));
// zero out the rest of the buffer
memset(resampled_16khz[0] + number_of_frames, 0,
(WHISPER_SAMPLE_RATE - number_of_frames) * sizeof(float));

speech_start_frame = 0;
speech_end_frame = WHISPER_SAMPLE_RATE;
}
}
}
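
A rough sketch of the new "pad short speech to one second" behaviour on a plain mono float buffer. `pad_speech_to_one_second` and its arguments are hypothetical names for illustration, and the buffer is assumed to hold at least `WHISPER_SAMPLE_RATE` (one second at 16 kHz) frames.

```cpp
// Sketch only: shift a short VAD-detected speech region to the buffer start and zero-pad it.
#include <cstring>

#define WHISPER_SAMPLE_RATE 16000 // Whisper's fixed input rate; 16000 frames == 1 second

static void pad_speech_to_one_second(float *samples, int speech_start_frame, int speech_end_frame)
{
	const int number_of_frames = speech_end_frame - speech_start_frame;
	if (number_of_frames <= 0 || number_of_frames >= WHISPER_SAMPLE_RATE) {
		return; // nothing to do: empty region, or already at least one second long
	}
	// move the speech segment to the front of the buffer (regions may overlap, so memmove)
	memmove(samples, samples + speech_start_frame, number_of_frames * sizeof(float));
	// zero out the remainder up to the one-second mark
	memset(samples + number_of_frames, 0,
	       (WHISPER_SAMPLE_RATE - number_of_frames) * sizeof(float));
}
```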
@@ -511,24 +489,29 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
obs_log(gf->log_level, "audio processing of %lu ms data took %d ms", last_num_frames_ms,
(int)duration);

if (last_step_in_segment) {
if (save_overlap_region) {
const uint64_t overlap_size_ms =
(uint64_t)(gf->overlap_frames * 1000 / gf->sample_rate);
obs_log(gf->log_level,
"copying %lu frames (%lu ms) from the end of the buffer (pos %lu) to the beginning",
"copying %lu overlap frames (%lu ms) from the end of the buffer (pos %lu) to the beginning",
gf->overlap_frames, overlap_size_ms,
gf->last_num_frames - gf->overlap_frames);
for (size_t c = 0; c < gf->channels; c++) {
// This is the last step in the segment - reset the copy buffer (include overlap frames)
// zero out the copy buffer, just in case
memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float));
// move overlap frames from the end of the last copy_buffers to the beginning
memcpy(gf->copy_buffers[c],
gf->copy_buffers[c] + gf->last_num_frames - gf->overlap_frames,
gf->overlap_frames * sizeof(float));
// zero out the rest of the buffer, just in case
memset(gf->copy_buffers[c] + gf->overlap_frames, 0,
(gf->frames - gf->overlap_frames) * sizeof(float));
memmove(gf->copy_buffers[c],
gf->copy_buffers[c] + gf->last_num_frames - gf->overlap_frames,
gf->overlap_frames * sizeof(float));
}
gf->last_num_frames = gf->overlap_frames;
} else {
obs_log(gf->log_level, "no overlap needed. zeroing out the copy buffer");
// zero out the copy buffer, just in case
for (size_t c = 0; c < gf->channels; c++) {
memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float));
}
gf->last_num_frames = 0;
}
}

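
Under some simplifying assumptions, the overlap handling above reduces to the following sketch over per-channel float buffers. `carry_overlap` and its parameters are illustrative names rather than plugin API; it returns the number of frames left waiting in the buffer, which plays the role of `gf->last_num_frames`.

```cpp
// Sketch only: keep the tail of the processed audio as overlap for the next segment, or clear.
#include <cstddef>
#include <cstring>

static size_t carry_overlap(float **channels, size_t num_channels, size_t total_frames,
			    size_t processed_frames, size_t overlap_frames,
			    bool save_overlap_region)
{
	if (save_overlap_region && processed_frames >= overlap_frames) {
		for (size_t c = 0; c < num_channels; c++) {
			// move the last overlap_frames samples to the start of the buffer
			memmove(channels[c], channels[c] + processed_frames - overlap_frames,
				overlap_frames * sizeof(float));
			// clear everything after the carried-over overlap, just in case
			memset(channels[c] + overlap_frames, 0,
			       (total_frames - overlap_frames) * sizeof(float));
		}
		return overlap_frames;
	}
	// no speech pressed against the end of the buffer: drop everything
	for (size_t c = 0; c < num_channels; c++) {
		memset(channels[c], 0, total_frames * sizeof(float));
	}
	return 0;
}
```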
4 changes: 2 additions & 2 deletions src/whisper-utils/whisper-processing.h
@@ -6,9 +6,9 @@
// buffer size in msec
#define DEFAULT_BUFFER_SIZE_MSEC 3000
// overlap in msec
#define DEFAULT_OVERLAP_SIZE_MSEC 100
#define DEFAULT_OVERLAP_SIZE_MSEC 150
#define MAX_OVERLAP_SIZE_MSEC 1000
#define MIN_OVERLAP_SIZE_MSEC 100
#define MIN_OVERLAP_SIZE_MSEC 150

enum DetectionResult {
DETECTION_RESULT_UNKNOWN = 0,
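
As a quick sanity check on the new defaults, the frame-count arithmetic below assumes a 48 kHz OBS audio source alongside Whisper's fixed 16 kHz input; the plugin itself derives these values from the source's actual sample rate at runtime.

```cpp
// Illustrative arithmetic only; the 48 kHz source rate is an assumption.
#define DEFAULT_BUFFER_SIZE_MSEC 3000
#define DEFAULT_OVERLAP_SIZE_MSEC 150
#define WHISPER_SAMPLE_RATE 16000

static const unsigned int source_sample_rate = 48000; // assumed OBS audio rate

// frames buffered per segment at the source rate: 48000 * 3000 / 1000 = 144000
static const unsigned int buffer_frames =
	source_sample_rate * DEFAULT_BUFFER_SIZE_MSEC / 1000;
// frames carried over between segments: 48000 * 150 / 1000 = 7200
static const unsigned int overlap_frames =
	source_sample_rate * DEFAULT_OVERLAP_SIZE_MSEC / 1000;
// the same overlap after resampling to 16 kHz for Whisper: 16000 * 150 / 1000 = 2400
static const unsigned int overlap_frames_16khz =
	WHISPER_SAMPLE_RATE * DEFAULT_OVERLAP_SIZE_MSEC / 1000;
```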