Last active 1603279659

Alyssa Smith revised this gist 1603315658. Go to revision

1 file changed, 113 insertions

transcribe_google_speech_to_text.py(file created)

@@ -0,0 +1,113 @@
1 + #!/usr/bin/env python3
2 + # pip3 install google_cloud_storage google_cloud_speech srt
3 + from google.cloud import speech, storage
4 + from sys import stderr
5 + from time import sleep
6 + from argparse import ArgumentParser
7 +
def _build_parser():
    """Create the command-line parser for the transcription script."""
    cli = ArgumentParser()
    # Positional: the audio file to upload and transcribe.
    cli.add_argument("filename")
    # Optional switches, declared as (flags, keyword options) pairs so the
    # whole command-line surface can be read at a glance.
    option_table = (
        (
            ("-w", "--word_time"),
            {"action": "store_true", "help": "enable_word_time_offsets"},
        ),
        (("-p", "--punctuation"), {"action": "store_true"}),
        (("-l", "--lang"), {"default": "en_US"}),
        (("-r", "--sample_rate"), {"type": int, "default": 44100}),
        (("--bucket",), {"default": "adslivetranscribe"}),
        (("--keep_in_storage",), {"action": "store_true"}),
    )
    for flags, options in option_table:
        cli.add_argument(*flags, **options)
    return cli


parser = _build_parser()
args = parser.parse_args()

# The subtitle dependencies are only needed when word timings were requested,
# so they are imported lazily to keep plain transcription free of them.
if args.word_time:
    import datetime
    import srt
19 +
20 + # from https://medium.com/searce/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd
def subtitle_generation(response, bin_size=3):
    """Group recognized words into time bins and render them as SRT text.

    Each result's words are collected into consecutive bins of roughly
    ``bin_size`` seconds; every bin becomes one subtitle entry.

    Relies on the module-level names ``srt`` and ``datetime``, which this
    script imports only when ``--word_time`` is given.

    Args:
        response: Recognition response. Only ``response.results`` is read;
            for each result the first alternative's ``words`` are used, and
            each word must expose ``word``, ``start_time`` and ``end_time``.
            NOTE(review): the time fields are assumed to offer ``.seconds``
            and ``.microseconds`` (as ``datetime.timedelta`` does) -- confirm
            against the installed google-cloud-speech version.
        bin_size: Length of a bin in whole seconds (default 3).

    Returns:
        The value returned by ``srt.compose`` for the collected subtitles.
    """
    transcriptions = []

    def add_subtitle(start_sec, start_microsec, end_sec, end_microsec, text):
        # Number entries consecutively from 1 in the order they are produced.
        transcriptions.append(
            srt.Subtitle(
                len(transcriptions) + 1,
                datetime.timedelta(0, start_sec, start_microsec),
                datetime.timedelta(0, end_sec, end_microsec),
                text,
            )
        )

    for result in response.results:
        # Skip results that carry no alternative or no word timings explicitly
        # instead of swallowing every IndexError raised anywhere in the body.
        if not result.alternatives:
            continue
        words = list(result.alternatives[0].words)
        if not words:
            continue

        first_word = words[0]
        if first_word.start_time.seconds:
            # Bin starts where the first word of this result starts.
            start_sec = first_word.start_time.seconds
            start_microsec = first_word.start_time.microseconds
        else:
            # A first word beginning within the first second is shown from
            # 0:00 (sub-second offset intentionally dropped, as before).
            start_sec = 0
            start_microsec = 0
        bin_end_sec = start_sec + bin_size

        transcript = first_word.word
        previous_word = first_word

        for word in words[1:]:
            # Only whole seconds are compared against the bin boundary.
            if word.end_time.seconds < bin_end_sec:
                transcript = transcript + " " + word.word
            else:
                # Close the current bin at the end of the previous word ...
                add_subtitle(
                    start_sec,
                    start_microsec,
                    previous_word.end_time.seconds,
                    previous_word.end_time.microseconds,
                    transcript,
                )
                # ... and open a new bin starting at the current word.
                start_sec = word.start_time.seconds
                start_microsec = word.start_time.microseconds
                bin_end_sec = start_sec + bin_size
                transcript = word.word
            previous_word = word

        # Flush the last (possibly only) bin; it ends with the result's
        # final word.
        last_word = words[-1]
        add_subtitle(
            start_sec,
            start_microsec,
            last_word.end_time.seconds,
            last_word.end_time.microseconds,
            transcript,
        )

    # Turn the subtitle list into SRT-formatted text.
    return srt.compose(transcriptions)
84 +
# Recognition settings are taken straight from the command-line arguments.
client = speech.SpeechClient()

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
    sample_rate_hertz=args.sample_rate,
    language_code=args.lang,
    enable_automatic_punctuation=args.punctuation,
    enable_word_time_offsets=args.word_time,
)

# Upload the local file to the bucket; the blob name doubles as the local path.
storage_client = storage.Client()
bucket = storage_client.bucket(args.bucket)
blob = bucket.blob(args.filename)
print("uploading {}...".format(blob.name), file=stderr)
blob.upload_from_filename(blob.name)
print("done uploading, processing", file=stderr)

# Everything after a successful upload runs under try/finally so the uploaded
# blob is removed even when recognition or output fails, unless the user asked
# to keep it. The upload itself stays outside: if it fails there is nothing
# to delete.
try:
    audio = speech.RecognitionAudio(uri="gs://{}/{}".format(args.bucket, blob.name))

    operation = client.long_running_recognize(config=config, audio=audio)

    # Poll every two seconds, growing a dotted progress line on stderr.
    x = 0
    while not operation.done():
        print("Waiting" + ("." * x) + "\r", end="", file=stderr)
        x += 1
        sleep(2)

    print("", file=stderr)
    response = operation.result()

    if not args.word_time:
        # Plain transcript: concatenate the top alternative of every result.
        print("".join(r.alternatives[0].transcript for r in response.results))
    else:
        # Word timings were requested: emit subtitles instead.
        print(subtitle_generation(response))
finally:
    if not args.keep_in_storage:
        blob.delete()
Newer Older