Skip to content

Commit 40c890e

Browse files
Roopan-Microsoft, AjitPadhi-Microsoft, Pavan-Microsoft, ross-p-smith, gpickett
authored
fix: merging dev changes to main branch (#1786)
Signed-off-by: dependabot[bot] <[email protected]> Co-authored-by: Ajit Padhi <[email protected]> Co-authored-by: Pavan-Microsoft <[email protected]> Co-authored-by: Ross Smith <[email protected]> Co-authored-by: gpickett <[email protected]> Co-authored-by: Francia Riesco <[email protected]> Co-authored-by: Francia Riesco <[email protected]> Co-authored-by: Prajwal D C <[email protected]> Co-authored-by: Harmanpreet-Microsoft <[email protected]> Co-authored-by: UtkarshMishra-Microsoft <[email protected]> Co-authored-by: Priyanka-Microsoft <[email protected]> Co-authored-by: Prasanjeet-Microsoft <[email protected]> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Kiran-Siluveru-Microsoft <[email protected]> Co-authored-by: Prashant-Microsoft <[email protected]> Co-authored-by: Rohini-Microsoft <[email protected]> Co-authored-by: Avijit-Microsoft <[email protected]> Co-authored-by: RaviKiran-Microsoft <[email protected]> Co-authored-by: Somesh Joshi <[email protected]> Co-authored-by: Himanshi Agrawal <[email protected]> Co-authored-by: pradeepjha-microsoft <[email protected]> Co-authored-by: Harmanpreet Kaur <[email protected]> Co-authored-by: Bangarraju-Microsoft <[email protected]> Co-authored-by: Harsh-Microsoft <[email protected]> Co-authored-by: Kanchan-Microsoft <[email protected]> Co-authored-by: Cristopher Coronado <[email protected]> Co-authored-by: Cristopher Coronado Moreira <[email protected]>
1 parent a60bf67 commit 40c890e

19 files changed

+2674
-2345
lines changed

.github/workflows/build-docker.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ jobs:
5656

5757
# Login for 'dev' and 'demo' branches to cwydcontainerreg only
5858
- name: Docker Login to cwydcontainerreg (Dev/Demo)
59-
if: ${{ inputs.push == true && (github.ref_name == 'dev' || github.ref_name == 'demo') }}
59+
if: ${{ inputs.push == true && (github.ref_name == 'dev' || github.ref_name == 'demo' || github.ref_name == 'dependabotchanges') }}
6060
uses: docker/login-action@v3
6161
with:
6262
registry: ${{ inputs.new_registry }}

code/backend/batch/utilities/document_chunking/chunking_strategy.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ class ChunkingStrategy(Enum):
66
PAGE = "page"
77
FIXED_SIZE_OVERLAP = "fixed_size_overlap"
88
PARAGRAPH = "paragraph"
9+
JSON = "json"
910

1011

1112
class ChunkingSettings:
code/backend/batch/utilities/document_chunking/json.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import json
2+
from typing import List
3+
from .document_chunking_base import DocumentChunkingBase
4+
from langchain.text_splitter import RecursiveJsonSplitter
5+
from .chunking_strategy import ChunkingSettings
6+
from ..common.source_document import SourceDocument
7+
8+
9+
class JSONDocumentChunking(DocumentChunkingBase):
    """Chunking strategy that splits a JSON document with RecursiveJsonSplitter."""

    def __init__(self) -> None:
        pass

    def chunk(
        self, documents: List[SourceDocument], chunking: ChunkingSettings
    ) -> List[SourceDocument]:
        """Split the combined JSON content of ``documents`` into chunks.

        The ``content`` of every input document is concatenated, parsed as a
        single JSON value and handed to ``RecursiveJsonSplitter`` configured
        with ``chunking.chunk_size`` as its maximum chunk size. One
        ``SourceDocument`` is created per resulting chunk.

        Args:
            documents: Loaded pieces of one source file. The ``source`` of the
                first entry is used as the URL of every chunk.
            chunking: Settings object; only ``chunk_size`` is read here.

        Returns:
            One ``SourceDocument`` per chunk, in splitter order. Each carries
            an ``offset`` metadata entry equal to the cumulative length of the
            content strings of all preceding chunks. An empty input list
            yields an empty list.

        Raises:
            json.JSONDecodeError: If the concatenated content is not valid JSON.
        """
        # Nothing to chunk; avoids an IndexError on documents[0] below.
        if not documents:
            return []

        full_document_content = "".join(
            str(document.content) for document in documents
        )
        document_url = documents[0].source
        json_data = json.loads(full_document_content)
        splitter = RecursiveJsonSplitter(max_chunk_size=chunking.chunk_size)

        # Create a document for each chunk.
        chunked_documents = []
        chunk_offset = 0
        for idx, chunked_content in enumerate(splitter.split_json(json_data)):
            # NOTE(review): str() of a dict yields a Python repr (single
            # quotes), not JSON text; kept as-is to preserve stored content.
            content = str(chunked_content)
            chunked_documents.append(
                SourceDocument.from_metadata(
                    content=content,
                    document_url=document_url,
                    metadata={"offset": chunk_offset},
                    idx=idx,
                )
            )
            # Advance by the length of the text actually stored. Taking len()
            # of the split result itself would count its top-level entries
            # rather than characters.
            chunk_offset += len(content)
        return chunked_documents

code/backend/batch/utilities/document_chunking/strategies.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from .page import PageDocumentChunking
44
from .fixed_size_overlap import FixedSizeOverlapDocumentChunking
55
from .paragraph import ParagraphDocumentChunking
6+
from .json import JSONDocumentChunking
67

78

89
def get_document_chunker(chunking_strategy: str):
@@ -14,5 +15,7 @@ def get_document_chunker(chunking_strategy: str):
1415
return FixedSizeOverlapDocumentChunking()
1516
elif chunking_strategy == ChunkingStrategy.PARAGRAPH.value:
1617
return ParagraphDocumentChunking()
18+
elif chunking_strategy == ChunkingStrategy.JSON.value:
19+
return JSONDocumentChunking()
1720
else:
1821
raise Exception(f"Unknown chunking strategy: {chunking_strategy}")

code/backend/batch/utilities/helpers/config/config_helper.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def get_available_document_types(self) -> list[str]:
6868
"jpg",
6969
"png",
7070
"docx",
71+
"json"
7172
}
7273
if self.env_helper.USE_ADVANCED_IMAGE_PROCESSING:
7374
document_types.update(ADVANCED_IMAGE_PROCESSING_FILE_TYPES)
@@ -308,10 +309,10 @@ def clear_config():
308309
@staticmethod
309310
def _append_advanced_image_processors():
310311
image_file_types = ["jpeg", "jpg", "png", "tiff", "bmp"]
311-
ConfigHelper._remove_processors_for_file_types(image_file_types)
312+
# ConfigHelper._remove_processors_for_file_types(image_file_types)
312313
ConfigHelper._default_config["document_processors"].extend(
313314
[
314-
{"document_type": file_type, "use_advanced_image_processing": True}
315+
{"document_type": file_type, "chunking" : ConfigHelper._default_config["document_processors"][0]["chunking"], "loading" : ConfigHelper._default_config["document_processors"][0]["loading"], "use_advanced_image_processing": True}
315316
for file_type in image_file_types
316317
]
317318
)

code/backend/batch/utilities/helpers/config/default.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,17 @@
9797
"strategy": "docx"
9898
}
9999
},
100+
{
101+
"document_type": "json",
102+
"chunking": {
103+
"strategy": "json",
104+
"size": 500,
105+
"overlap": 100
106+
},
107+
"loading": {
108+
"strategy": "web"
109+
}
110+
},
100111
{
101112
"document_type": "jpg",
102113
"chunking": {

0 commit comments

Comments
 (0)