sul-dlss · peetucket · Jan 27, 2025 · Dec 9, 2024 · Dec 10, 2024 · Dec 10, 2024
diff --git a/lib/dor/text_extraction/speech_to_text.rb b/lib/dor/text_extraction/speech_to_text.rb
@@ -52,10 +52,53 @@ def cleanup_s3_folder
       # return a list of filenames that should be stt'd
       # iterate over all files in cocina_object.structural.contains, looking at mimetypes
       # return a list of filenames that are correct mimetype
+      # then filter out any files that either (1) do not have an audio track or (2) have audio that is mostly silent
       def filenames_to_stt
-        stt_files.map(&:filename)
+        candidate_filenames = stt_files.map(&:filename)
+        candidate_filenames.select { |candidate| has_useful_audio_track?(candidate) } # drop files with no audio track or mostly silent audio
       end
 
+      # first verify that the file has an audio track, then check the audio metadata to determine if the audio is mostly silent
+      # using technical metadata generated in https://github.com/sul-dlss/technical-metadata-service/pull/572
+      # check the audio max_volume and mean_volume fields to determine if the audio is mostly silent
+      # if will raise an error if this metadata is missing
+      # rubocop:disable Metrics/CyclomaticComplexity
+      # rubocop:disable Metrics/PerceivedComplexity
+      # rubocop:disable Metrics/AbcSize
+      def has_useful_audio_track?(filename)
+        return false unless file_level_tech_metadata(filename)&.dig('av_metadata', 'audio_count')&.positive?
+
+        audio_metadata = file_level_tech_metadata(filename)&.dig('dro_file_parts')&.find { |parts| parts['part_type'] == 'audio' }&.dig('audio_metadata')
+
+        raise "No audio metadata found for #{filename}" unless audio_metadata
+        raise "Audio metadata missing max_volume and mean_volume for #{filename}" unless audio_metadata['max_volume'] && audio_metadata['mean_volume']
+
+        audio_metadata['mean_volume'] > -40 && audio_metadata['max_volume'] > -30
+      end
+      # rubocop:enable Metrics/CyclomaticComplexity
+      # rubocop:enable Metrics/PerceivedComplexity
+      # rubocop:enable Metrics/AbcSize
+
+      # return the technical metadata for a given filename
+      def file_level_tech_metadata(filename)
+        tech_metadata.find { |file| file['filename'] == filename }
+      end
+
+      # return the technical metadata for the object from the technical-metadata-service and parse it as json
+      # rubocop:disable Metrics/AbcSize
+      def tech_metadata
+        @tech_metadata ||= begin
+          resp = Faraday.get("#{Settings.tech_md_service.url}/v1/technical-metadata/druid/#{cocina_object.externalIdentifier}") do |req|
+            req.headers['Content-Type'] = 'application/json'
+            req.headers['Authorization'] = "Bearer #{Settings.tech_md_service.token}"
+          end
+          raise "Technical-metadata-service returned #{resp.status} when requesting techmd for #{bare_druid}: #{resp.body}" unless resp.success?
+
+          JSON.parse(resp.body)
+        end
+      end
+      # rubocop:enable Metrics/AbcSize
+
       # return the s3 location for a given filename
       def s3_location(filename)
         File.join(job_id, filename)

diff --git a/lib/robots/dor_repo/speech_to_text/start_stt.rb b/lib/robots/dor_repo/speech_to_text/start_stt.rb
@@ -10,9 +10,6 @@ def initialize
         end
 
         def perform_work
-          # TODO: Note that the `possible?` method is not complete until we further refine the mimetypes available
-          # see https://github.com/sul-dlss/common-accessioning/issues/1346
-          # and lib/dor/text_extraction/speech_to_text.rb#allowed_mimetypes
           if Dor::TextExtraction::SpeechToText.new(cocina_object:).possible?
             return if object_client.version.status.open?
 

diff --git a/spec/fixtures/technical_metadata/bc123df4567.json b/spec/fixtures/technical_metadata/bc123df4567.json
@@ -0,0 +1,146 @@
+[
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file1.m4a",
+        "filetype": "fmt/199",
+        "mimetype": "audio/mp4",
+        "bytes": 4887996,
+        "file_modification": "2024-12-04T00:04:55.328Z",
+        "av_metadata": {
+            "format": "MPEG-4",
+            "codec_id": "M4A ",
+            "duration": 297.13,
+            "audio_count": 1,
+            "file_extension": "m4a",
+            "format_profile": "Apple audio with iTunes info"
+        },
+        "dro_file_parts": [
+            {
+                "part_type": "audio",
+                "part_id": "1",
+                "order": 0,
+                "format": "AAC",
+                "audio_metadata": {
+                    "channels": "2",
+                    "codec_id": "mp4a-40-2",
+                    "stream_size": 4831404,
+                    "sampling_rate": 48000,
+                    "mean_volume": -35,
+                    "max_volume": -25
+                }
+            }
+        ]
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file1.mp4",
+        "filetype": "fmt/199",
+        "mimetype": "video/mp4",
+        "bytes": 4887996,
+        "file_modification": "2024-12-04T00:04:55.328Z",
+        "av_metadata": {
+            "format": "MPEG-4",
+            "codec_id": "M4A ",
+            "duration": 297.13,
+            "audio_count": 1,
+            "file_extension": "m4a",
+            "format_profile": "Apple audio with iTunes info"
+        },
+        "dro_file_parts": [
+            {
+                "part_type": "audio",
+                "part_id": "1",
+                "order": 0,
+                "format": "AAC",
+                "audio_metadata": {
+                    "channels": "2",
+                    "codec_id": "mp4a-40-2",
+                    "stream_size": 4831404,
+                    "sampling_rate": 48000,
+                    "mean_volume": -35,
+                    "max_volume": -25
+                }
+            }
+        ]
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file2.mp4",
+        "filetype": "fmt/199",
+        "mimetype": "video/mp4",
+        "bytes": 4887996,
+        "file_modification": "2024-12-04T00:04:55.328Z",
+        "av_metadata": {
+            "format": "MPEG-4",
+            "codec_id": "M4A ",
+            "duration": 297.13,
+            "audio_count": 1,
+            "file_extension": "m4a",
+            "format_profile": "Apple audio with iTunes info"
+        },
+        "dro_file_parts": [
+            {
+                "part_type": "audio",
+                "part_id": "1",
+                "order": 0,
+                "format": "AAC",
+                "audio_metadata": {
+                    "channels": "2",
+                    "codec_id": "mp4a-40-2",
+                    "stream_size": 4831404,
+                    "sampling_rate": 48000,
+                    "mean_volume": -35,
+                    "max_volume": -25
+                }
+            }
+        ]
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file3.mp4",
+        "filetype": "fmt/199",
+        "mimetype": "video/mp4",
+        "bytes": 4887996,
+        "file_modification": "2024-12-04T00:04:55.328Z",
+        "av_metadata": {
+            "format": "MPEG-4",
+            "codec_id": "M4A ",
+            "duration": 297.13,
+            "audio_count": 1,
+            "file_extension": "m4a",
+            "format_profile": "Apple audio with iTunes info"
+        },
+        "dro_file_parts": [
+            {
+                "part_type": "audio",
+                "part_id": "1",
+                "order": 0,
+                "format": "AAC",
+                "audio_metadata": {
+                    "channels": "2",
+                    "codec_id": "mp4a-40-2",
+                    "stream_size": 4831404,
+                    "sampling_rate": 48000,
+                    "mean_volume": -35,
+                    "max_volume": -25
+                }
+            }
+        ]
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file1.txt",
+        "filetype": "fmt/817",
+        "mimetype": "application/text",
+        "bytes": 88457,
+        "file_modification": "2024-12-04T00:13:16.502Z"
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file2.txt",
+        "filetype": "fmt/817",
+        "mimetype": "application/text",
+        "bytes": 91457,
+        "file_modification": "2024-12-04T00:13:16.502Z"
+    }
+]
diff --git a/spec/fixtures/technical_metadata/bc123df4567_no_audio_track.json b/spec/fixtures/technical_metadata/bc123df4567_no_audio_track.json
@@ -0,0 +1,144 @@
+[
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file1.m4a",
+        "filetype": "fmt/199",
+        "mimetype": "audio/mp4",
+        "bytes": 4887996,
+        "file_modification": "2024-12-04T00:04:55.328Z",
+        "av_metadata": {
+            "format": "MPEG-4",
+            "codec_id": "M4A ",
+            "duration": 0,
+            "audio_count": 0,
+            "file_extension": "m4a",
+            "format_profile": "Apple audio with iTunes info"
+        },
+        "dro_file_parts": [
+            {
+                "part_type": "audio",
+                "part_id": "1",
+                "order": 0,
+                "format": "AAC",
+                "audio_metadata": {
+                    "channels": "2",
+                    "codec_id": "mp4a-40-2",
+                    "stream_size": 4831404,
+                    "sampling_rate": 48000
+                }
+            }
+        ]
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file1.mp4",
+        "filetype": "fmt/199",
+        "mimetype": "video/mp4",
+        "bytes": 4887996,
+        "file_modification": "2024-12-04T00:04:55.328Z",
+        "av_metadata": {
+            "format": "MPEG-4",
+            "codec_id": "M4A ",
+            "duration": 297.13,
+            "audio_count": 1,
+            "file_extension": "m4a",
+            "format_profile": "Apple audio with iTunes info"
+        },
+        "dro_file_parts": [
+            {
+                "part_type": "audio",
+                "part_id": "1",
+                "order": 0,
+                "format": "AAC",
+                "audio_metadata": {
+                    "channels": "2",
+                    "codec_id": "mp4a-40-2",
+                    "stream_size": 4831404,
+                    "sampling_rate": 48000,
+                    "mean_volume": -35,
+                    "max_volume": -25
+                }
+            }
+        ]
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file2.mp4",
+        "filetype": "fmt/199",
+        "mimetype": "video/mp4",
+        "bytes": 4887996,
+        "file_modification": "2024-12-04T00:04:55.328Z",
+        "av_metadata": {
+            "format": "MPEG-4",
+            "codec_id": "M4A ",
+            "duration": 297.13,
+            "audio_count": 1,
+            "file_extension": "m4a",
+            "format_profile": "Apple audio with iTunes info"
+        },
+        "dro_file_parts": [
+            {
+                "part_type": "audio",
+                "part_id": "1",
+                "order": 0,
+                "format": "AAC",
+                "audio_metadata": {
+                    "channels": "2",
+                    "codec_id": "mp4a-40-2",
+                    "stream_size": 4831404,
+                    "sampling_rate": 48000,
+                    "mean_volume": -35,
+                    "max_volume": -25
+                }
+            }
+        ]
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file3.mp4",
+        "filetype": "fmt/199",
+        "mimetype": "video/mp4",
+        "bytes": 4887996,
+        "file_modification": "2024-12-04T00:04:55.328Z",
+        "av_metadata": {
+            "format": "MPEG-4",
+            "codec_id": "M4A ",
+            "duration": 297.13,
+            "audio_count": 1,
+            "file_extension": "m4a",
+            "format_profile": "Apple audio with iTunes info"
+        },
+        "dro_file_parts": [
+            {
+                "part_type": "audio",
+                "part_id": "1",
+                "order": 0,
+                "format": "AAC",
+                "audio_metadata": {
+                    "channels": "2",
+                    "codec_id": "mp4a-40-2",
+                    "stream_size": 4831404,
+                    "sampling_rate": 48000,
+                    "mean_volume": -35,
+                    "max_volume": -25
+                }
+            }
+        ]
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file1.txt",
+        "filetype": "fmt/817",
+        "mimetype": "application/text",
+        "bytes": 88457,
+        "file_modification": "2024-12-04T00:13:16.502Z"
+    },
+    {
+        "druid": "druid:bc123df4567",
+        "filename": "file2.txt",
+        "filetype": "fmt/817",
+        "mimetype": "application/text",
+        "bytes": 91457,
+        "file_modification": "2024-12-04T00:13:16.502Z"
+    }
+]