Last active 1603279659

Revision 2ad2fcfaae2ba8d317e2eccee0a284ce13c29c6f

transcribe_google_speech_to_text.py Raw
1#!/usr/bin/env python3
2# pip3 install google_cloud_storage google_cloud_speech srt
3from google.cloud import speech, storage
4from sys import stderr
5from time import sleep
6from argparse import ArgumentParser
7
# Command-line interface. The parsed values feed the recognition config and
# the Cloud Storage upload performed further down in this script.
parser = ArgumentParser()
parser.add_argument("filename", help="local audio file; also used as the object name in the bucket")
parser.add_argument("-w", "--word_time", action="store_true", help="enable_word_time_offsets")
parser.add_argument("-p", "--punctuation", action="store_true", help="enable_automatic_punctuation")
# language_code is documented as a BCP-47 tag, which uses a hyphen ("en-US"),
# not the POSIX-locale underscore form ("en_US").
parser.add_argument("-l", "--lang", default="en-US", help="BCP-47 language tag (default: %(default)s)")
parser.add_argument("-r", "--sample_rate", type=int, default=44100, help="sample rate in Hz (default: %(default)s)")
parser.add_argument("--bucket", default="adslivetranscribe", help="Cloud Storage bucket to upload to (default: %(default)s)")
parser.add_argument("--keep_in_storage", action="store_true", help="do not delete the uploaded object afterwards")
args = parser.parse_args()
if args.word_time:
    # Only needed for subtitle output, so plain-transcript users do not have
    # to install the third-party `srt` package.
    import datetime
    import srt
19
20# from https://medium.com/searce/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd
def subtitle_generation(response, bin_size=3):
    """Group recognised words into time bins and render them as SRT text.

    Each result's words are collected into consecutive bins of roughly
    ``bin_size`` seconds. A bin is closed as soon as a word's end time (whole
    seconds only) reaches the bin's end, and that word opens the next bin.
    Bins never span two results.

    Relies on the module-level names ``srt`` and ``datetime``, which this
    file imports only when ``--word_time`` is given.

    Args:
        response: recognition response whose ``results`` each expose
            ``alternatives[0].words`` with ``word``, ``start_time`` and
            ``end_time`` (the latter two providing ``seconds`` and
            ``microseconds``).
        bin_size: length of a bin in seconds.

    Returns:
        The value of ``srt.compose()`` for the generated subtitles.
    """
    transcriptions = []

    def _emit(start_sec, start_microsec, end_time, text):
        # Number subtitles consecutively from 1. The previous code bumped the
        # counter twice per result, leaving gaps in the numbering.
        # NOTE(review): srt.compose() reindexes by default per its docs, so
        # the rendered output should be unaffected -- confirm if that default
        # is ever overridden.
        transcriptions.append(
            srt.Subtitle(
                len(transcriptions) + 1,
                datetime.timedelta(0, start_sec, start_microsec),
                datetime.timedelta(0, end_time.seconds, end_time.microseconds),
                text,
            )
        )

    for result in response.results:
        # Guard only the lookups that are expected to fail: a result with no
        # alternatives or no words is skipped. Anything else raising
        # IndexError would be a real bug and is no longer swallowed.
        try:
            words = result.alternatives[0].words
            first_word = words[0]
        except IndexError:
            continue

        if first_word.start_time.seconds:
            # bin start -> first word of this result
            start_sec = first_word.start_time.seconds
            start_microsec = first_word.start_time.microseconds
        else:
            # A first word that begins within the first second is pinned to
            # 0.0 (its microseconds are deliberately not used).
            start_sec = 0
            start_microsec = 0
        end_sec = start_sec + bin_size
        transcript = first_word.word

        previous_word = first_word
        for position in range(1, len(words)):
            current_word = words[position]
            if current_word.end_time.seconds < end_sec:
                # still inside the current bin
                transcript = transcript + " " + current_word.word
            else:
                # close the bin at the end of the previous word ...
                _emit(start_sec, start_microsec, previous_word.end_time, transcript)
                # ... and open a new one starting with the current word
                start_sec = current_word.start_time.seconds
                start_microsec = current_word.start_time.microseconds
                end_sec = start_sec + bin_size
                transcript = current_word.word
            previous_word = current_word

        # flush the bin that is still open; previous_word is now the last word
        _emit(start_sec, start_microsec, previous_word.end_time, transcript)

    # turn transcription list into subtitles
    return srt.compose(transcriptions)
84
# Main script: upload the audio file to Cloud Storage, run long-running
# recognition on it, print the transcript (or SRT subtitles with --word_time)
# to stdout, and finally remove the uploaded object unless asked to keep it.
# Progress messages go to stderr so stdout carries only the result.
client = speech.SpeechClient()

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
    sample_rate_hertz=args.sample_rate,
    language_code=args.lang,
    enable_automatic_punctuation=args.punctuation,
    enable_word_time_offsets=args.word_time,
)

storage_client = storage.Client()
bucket = storage_client.bucket(args.bucket)
blob = bucket.blob(args.filename)
print("uploading {}...".format(blob.name), file=stderr)
blob.upload_from_filename(blob.name)

# From here on the object exists in the bucket. Everything else runs inside
# try/finally so that a failure during recognition or output does not leave
# the uploaded file behind.
try:
    print("done uploading, processing", file=stderr)
    audio = speech.RecognitionAudio(uri="gs://{}/{}".format(args.bucket, blob.name))

    operation = client.long_running_recognize(config=config, audio=audio)

    # Poll every two seconds, growing a row of dots on one stderr line.
    x = 0
    while not operation.done():
        print("Waiting" + ("." * x) + "\r", end="", file=stderr)
        x += 1
        sleep(2)

    print("", file=stderr)
    response = operation.result()

    if not args.word_time:
        print("".join(r.alternatives[0].transcript for r in response.results))
    else:
        print(subtitle_generation(response))
finally:
    if not args.keep_in_storage:
        blob.delete()