# transcribe_google_speech_to_text.py
#!/usr/bin/env python3
# pip3 install google_cloud_storage google_cloud_speech srt
from google.cloud import speech, storage
from sys import stderr
from time import sleep
from argparse import ArgumentParser
parser = ArgumentParser(description="Transcribe an audio file with Google Cloud Speech-to-Text.")
# positional: local audio file; the same name is used for the object uploaded to the bucket
parser.add_argument("filename")
parser.add_argument("-w", "--word_time", action="store_true", help="enable_word_time_offsets")
parser.add_argument("-p", "--punctuation", action="store_true", help="enable_automatic_punctuation")
# The Speech-to-Text API expects a BCP-47 language tag (hyphen, not underscore);
# the previous default "en_US" is not a valid BCP-47 tag.
parser.add_argument("-l", "--lang", default="en-US", help="BCP-47 language code, e.g. en-US")
parser.add_argument("-r", "--sample_rate", type=int, default=44100, help="audio sample rate in hertz")
parser.add_argument("--bucket", default="adslivetranscribe", help="GCS bucket used as a staging area")
parser.add_argument("--keep_in_storage", action="store_true", help="do not delete the uploaded object afterwards")
args = parser.parse_args()
if args.word_time:
    # srt/datetime are only needed when emitting SRT subtitles
    import srt
    import datetime
# from https://medium.com/searce/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd
def subtitle_generation(response, bin_size=3):
    """Turn a long-running-recognize response into an SRT subtitle string.

    Words are grouped into time bins of roughly ``bin_size`` seconds each;
    consecutive words whose end time falls inside the current bin are
    grouped together into one subtitle entry.

    Parameters
    ----------
    response : result of ``operation.result()`` from
        ``SpeechClient.long_running_recognize``; must have been requested
        with ``enable_word_time_offsets=True`` so each word carries timings.
    bin_size : target subtitle duration in whole seconds.

    Returns
    -------
    str : the composed SRT document.
    """
    transcriptions = []
    index = 0
    for result in response.results:
        # Skip results with no alternatives or no word timings, instead of
        # relying on a broad IndexError catch (which could also mask bugs).
        if not result.alternatives or not result.alternatives[0].words:
            continue
        words = result.alternatives[0].words
        # NOTE(review): .seconds/.microseconds ignore any .days component of
        # the time offsets -- fine for audio shorter than 24 hours; confirm.
        first_start = words[0].start_time
        if first_start.seconds:
            # bin start -> first word of this result
            start_sec = first_start.seconds
            start_microsec = first_start.microseconds
        else:
            # bin start -> first word of the whole response starts at second 0
            start_sec = 0
            start_microsec = 0
        end_sec = start_sec + bin_size  # exclusive end (seconds) of the current bin
        transcript = words[0].word
        index += 1  # SRT entry numbers are 1-based
        # Walk consecutive word pairs: `word` is the word being placed,
        # `prev_word` supplies the end time of the bin that just closed.
        for prev_word, word in zip(words, words[1:]):
            if word.end_time.seconds < end_sec:
                # word still fits in the current bin
                transcript = transcript + " " + word.word
            else:
                # close the current bin at the previous word's end time
                transcriptions.append(srt.Subtitle(
                    index,
                    datetime.timedelta(seconds=start_sec, microseconds=start_microsec),
                    datetime.timedelta(seconds=prev_word.end_time.seconds,
                                       microseconds=prev_word.end_time.microseconds),
                    transcript))
                # start a new bin at this word
                start_sec = word.start_time.seconds
                start_microsec = word.start_time.microseconds
                end_sec = start_sec + bin_size
                transcript = word.word
                index += 1
        # flush the bin still open at the end of this result
        last_end = words[-1].end_time
        transcriptions.append(srt.Subtitle(
            index,
            datetime.timedelta(seconds=start_sec, microseconds=start_microsec),
            datetime.timedelta(seconds=last_end.seconds, microseconds=last_end.microseconds),
            transcript))
        index += 1
    # compose the subtitle objects into one SRT-formatted string
    return srt.compose(transcriptions)
client = speech.SpeechClient()

# ENCODING_UNSPECIFIED lets the service detect the container/codec itself.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
    sample_rate_hertz=args.sample_rate,
    language_code=args.lang,
    enable_automatic_punctuation=args.punctuation,
    enable_word_time_offsets=args.word_time,
)

# Stage the local file in Cloud Storage so the long-running recognizer can read it.
storage_client = storage.Client()
bucket = storage_client.bucket(args.bucket)
blob = bucket.blob(args.filename)
print("uploading {}...".format(blob.name), file=stderr)
# blob.name is exactly args.filename, so this uploads the local file of that name
blob.upload_from_filename(blob.name)
print("done uploading, processing", file=stderr)

audio = speech.RecognitionAudio(uri="gs://{}/{}".format(args.bucket, blob.name))
operation = client.long_running_recognize(config=config, audio=audio)

# Poll the long-running operation, printing a growing dot spinner on stderr.
ticks = 0
while not operation.done():
    print("Waiting" + ("." * ticks) + "\r", end="", file=stderr)
    ticks += 1
    sleep(2)
print("", file=stderr)

response = operation.result()
if args.word_time:
    # word timings requested -> emit SRT subtitles
    print(subtitle_generation(response))
else:
    # plain transcript: concatenate the best alternative of every result
    print("".join(r.alternatives[0].transcript for r in response.results))

# Clean up the staged object unless the user asked to keep it.
if not args.keep_in_storage:
    blob.delete()
# NOTE(review): a duplicated, line-numbered copy of this file (left over from
# the web-page scrape) was removed here; it contained no additional content.