@@ -42,7 +42,7 @@ def get_youtube_combined_transcript(youtube_id):
42
42
transcript_dict = get_youtube_transcript (youtube_id )
43
43
transcript = ''
44
44
for td in transcript_dict :
45
- transcript += '' .join (td ['text' ])
45
+ transcript += '' .join (td ['text' ])+ " "
46
46
return transcript
47
47
except Exception as e :
48
48
message = f"Youtube transcript is not available for youtube Id: { youtube_id } "
@@ -83,9 +83,20 @@ def get_documents_from_youtube(url):
83
83
# print(f'youtube page_content: {youtube_transcript[0].page_content}')
84
84
# print(f'youtube id: {youtube_transcript[0].metadata["id"]}')
85
85
# print(f'youtube title: {youtube_transcript[0].metadata["snippet"]["title"]}')
86
- transcript = get_youtube_combined_transcript (match .group (1 ))
86
+ transcript = get_youtube_transcript (match .group (1 ))
87
+ transcript_content = ''
88
+ counter = YOUTUBE_CHUNK_SIZE_SECONDS
89
+ pages = []
90
+ for i , td in enumerate (transcript ):
91
+ if td ['start' ] < counter :
92
+ transcript_content += '' .join (td ['text' ])+ " "
93
+ else :
94
+ transcript_content += '' .join (td ['text' ])+ " "
95
+ pages .append (Document (page_content = transcript_content .strip (), metadata = {'start_timestamp' :str (timedelta (seconds = counter - YOUTUBE_CHUNK_SIZE_SECONDS )).split ('.' )[0 ], 'end_timestamp' :str (timedelta (seconds = td ['start' ])).split ('.' )[0 ]}))
96
+ counter += YOUTUBE_CHUNK_SIZE_SECONDS
97
+ transcript_content = ''
98
+ pages .append (Document (page_content = transcript_content .strip (), metadata = {'start_timestamp' :str (timedelta (seconds = counter - YOUTUBE_CHUNK_SIZE_SECONDS )).split ('.' )[0 ], 'end_timestamp' :str (timedelta (seconds = transcript [- 1 ]['start' ] if transcript else counter )).split ('.' )[0 ]})) # Fall back to counter for end_timestamp when transcript is empty
87
99
file_name = match .group (1 )#youtube_transcript[0].metadata["snippet"]["title"]
88
- pages = [Document (page_content = transcript )]
89
100
return file_name , pages
90
101
except Exception as e :
91
102
error_message = str (e )
0 commit comments