Skip to content

Commit 061f0e3

Browse files
aashipandya authored and kaustubh-darekar committed
Youtube timestamp (#877)
* youtube timestamp added to metadata
* updated timestamp format during extraction
* added fix for last chunk
* updated default values of timestamp

---------

Co-authored-by: kaustubh-darekar <[email protected]>
1 parent 656b85c commit 061f0e3

File tree

2 files changed

+16
-5
lines changed

2 files changed

+16
-5
lines changed

backend/src/chunkid_entities.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -74,8 +74,8 @@ def process_chunk_data(chunk_data):
7474
for chunk in record["chunks"]:
7575
chunk.update(doc_properties)
7676
if chunk["fileSource"] == "youtube":
77-
chunk["start_time"] = min(time_to_seconds(chunk["start_time"]),time_to_seconds(chunk["end_time"]))
78-
chunk["end_time"] = time_to_seconds(chunk["end_time"])
77+
chunk["start_time"] = min(time_to_seconds(chunk.get('start_time',0)),time_to_seconds(chunk.get("end_time",0)))
78+
chunk["end_time"] = time_to_seconds(chunk.get("end_time",0))
7979
chunk_properties.append(chunk)
8080

8181
return chunk_properties

backend/src/document_sources/youtube.py

+14-3
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ def get_youtube_combined_transcript(youtube_id):
4242
transcript_dict = get_youtube_transcript(youtube_id)
4343
transcript=''
4444
for td in transcript_dict:
45-
transcript += ''.join(td['text'])
45+
transcript += ''.join(td['text'])+" "
4646
return transcript
4747
except Exception as e:
4848
message = f"Youtube transcript is not available for youtube Id: {youtube_id}"
@@ -83,9 +83,20 @@ def get_documents_from_youtube(url):
8383
# print(f'youtube page_content: {youtube_transcript[0].page_content}')
8484
# print(f'youtube id: {youtube_transcript[0].metadata["id"]}')
8585
# print(f'youtube title: {youtube_transcript[0].metadata["snippet"]["title"]}')
86-
transcript= get_youtube_combined_transcript(match.group(1))
86+
transcript= get_youtube_transcript(match.group(1))
87+
transcript_content=''
88+
counter = YOUTUBE_CHUNK_SIZE_SECONDS
89+
pages = []
90+
for i, td in enumerate(transcript):
91+
if td['start'] < counter:
92+
transcript_content += ''.join(td['text'])+" "
93+
else :
94+
transcript_content += ''.join(td['text'])+" "
95+
pages.append(Document(page_content=transcript_content.strip(), metadata={'start_timestamp':str(timedelta(seconds = counter-YOUTUBE_CHUNK_SIZE_SECONDS)).split('.')[0], 'end_timestamp':str(timedelta(seconds = td['start'])).split('.')[0]}))
96+
counter += YOUTUBE_CHUNK_SIZE_SECONDS
97+
transcript_content=''
98+
pages.append(Document(page_content=transcript_content.strip(), metadata={'start_timestamp':str(timedelta(seconds = counter-YOUTUBE_CHUNK_SIZE_SECONDS)).split('.')[0], 'end_timestamp':str(timedelta(seconds =transcript[-1]['start'] if transcript else counter)).split('.')[0]})) # Handle empty transcript_pieces
8799
file_name = match.group(1)#youtube_transcript[0].metadata["snippet"]["title"]
88-
pages = [Document(page_content=transcript)]
89100
return file_name, pages
90101
except Exception as e:
91102
error_message = str(e)

0 commit comments

Comments
 (0)