Alyssa Smith revised this gist. Go to revision
1 file changed, 113 insertions
transcribe_google_speech_to_text.py(file created)
| @@ -0,0 +1,113 @@ | |||
| 1 | + | #!/usr/bin/env python3 | |
| 2 | + | # pip3 install google_cloud_storage google_cloud_speech srt | |
| 3 | + | from google.cloud import speech, storage | |
| 4 | + | from sys import stderr | |
| 5 | + | from time import sleep | |
| 6 | + | from argparse import ArgumentParser | |
| 7 | + | ||
# Command-line interface.
#   filename          local audio file to transcribe (also used as the GCS object name)
#   -w/--word_time    request per-word timestamps and emit SRT subtitles
#   -p/--punctuation  enable automatic punctuation in the transcript
#   -l/--lang         BCP-47 language code (e.g. "en-US")
#   -r/--sample_rate  sample rate of the audio, in hertz
#   --bucket          Cloud Storage bucket used as a staging area
#   --keep_in_storage do not delete the uploaded object when done
parser = ArgumentParser()
parser.add_argument("filename")
parser.add_argument("-w", "--word_time", action="store_true", help="enable_word_time_offsets")
parser.add_argument("-p", "--punctuation", action="store_true")
# BUG FIX: Speech-to-Text expects BCP-47 language tags with a hyphen
# ("en-US"); the previous default "en_US" is rejected by the API with
# INVALID_ARGUMENT.
parser.add_argument("-l", "--lang", default="en-US")
parser.add_argument("-r", "--sample_rate", type=int, default=44100)
parser.add_argument("--bucket", default="adslivetranscribe")
parser.add_argument("--keep_in_storage", action="store_true")
args = parser.parse_args()
if args.word_time:
    # srt and datetime are only needed when composing subtitles.
    import srt, datetime
| 19 | + | ||
# adapted from https://medium.com/searce/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd
def subtitle_generation(response, bin_size=3):
    """Compose an SRT document from a long_running_recognize response.

    Words are grouped into caption "bins" of roughly *bin_size* seconds:
    a bin is flushed as soon as the next word would end at or past the
    bin's deadline, and the flushed caption ends where the last word that
    fit ended.  Returns the full SRT text as a single string.
    """
    captions = []
    index = 0

    for result in response.results:
        try:
            words = result.alternatives[0].words

            first_start = words[0].start_time
            if first_start.seconds:
                # bin start -> first word of this result
                start_sec = first_start.seconds
                start_micro = first_start.microseconds
            else:
                # bin start -> very first word of the response: anchor at 0
                start_sec = 0
                start_micro = 0
            deadline = start_sec + bin_size  # second at which this bin closes

            # end of the result's final word, used to close the last bin
            tail_end = words[-1].end_time

            caption = words[0].word
            index += 1  # SRT entries are numbered from 1

            # walk consecutive word pairs: `prev` ends the bin when `word`
            # no longer fits
            for prev, word in zip(words, words[1:]):
                try:
                    if word.end_time.seconds < deadline:
                        # word still fits in the current bin
                        caption = caption + " " + word.word
                    else:
                        # flush the bin, ending it where the previous word ended
                        captions.append(srt.Subtitle(
                            index,
                            datetime.timedelta(0, start_sec, start_micro),
                            datetime.timedelta(0, prev.end_time.seconds,
                                               prev.end_time.microseconds),
                            caption,
                        ))

                        # open a fresh bin starting at this word
                        start_sec = word.start_time.seconds
                        start_micro = word.start_time.microseconds
                        deadline = start_sec + bin_size
                        caption = word.word

                        index += 1
                except IndexError:
                    pass

            # close the final (possibly partial) bin of this result
            captions.append(srt.Subtitle(
                index,
                datetime.timedelta(0, start_sec, start_micro),
                datetime.timedelta(0, tail_end.seconds, tail_end.microseconds),
                caption,
            ))
            index += 1
        except IndexError:
            # a result without alternatives/words is skipped — preserves the
            # original best-effort behaviour
            pass

    # render the accumulated subtitle entries as SRT text
    return srt.compose(captions)
| 84 | + | ||
# Transcription pipeline: long-running recognition requires the audio to
# live in Cloud Storage, so upload it first, then poll the operation.
client = speech.SpeechClient()

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
    sample_rate_hertz=args.sample_rate,
    language_code=args.lang,
    enable_automatic_punctuation=args.punctuation,
    enable_word_time_offsets=args.word_time,
)

# Stage the local file in the bucket under its own name.
storage_client = storage.Client()
bucket = storage_client.bucket(args.bucket)
blob = bucket.blob(args.filename)
print("uploading {}...".format(blob.name), file=stderr)
blob.upload_from_filename(blob.name)
print("done uploading, processing", file=stderr)

audio = speech.RecognitionAudio(uri="gs://{}/{}".format(args.bucket, blob.name))
operation = client.long_running_recognize(config=config, audio=audio)

# Poll every 2s, redrawing a growing "Waiting..." line on stderr.
dots = 0
while not operation.done():
    print("Waiting" + ("." * dots) + "\r", end="", file=stderr)
    dots += 1
    sleep(2)

print("", file=stderr)
response = operation.result()

if args.word_time:
    # SRT output with per-word timing
    print(subtitle_generation(response))
else:
    # plain transcript: concatenate every result's top alternative
    print("".join(r.alternatives[0].transcript for r in response.results))

# Clean up the staged object unless the caller asked to keep it.
if not args.keep_in_storage:
    blob.delete()
Newer
Older