Commit 80dee28

Requirements to OLAP (#14759)
1 parent d35b323 commit 80dee28

20 files changed: +1724 -0 lines

ydb/requirements/collect_reqs.py

Lines changed: 305 additions & 0 deletions
@@ -0,0 +1,305 @@
# Collects requirements from req*.md files, resolves linked GitHub issues,
# and generates traceability matrices and summaries.
import os
import re

import requests

GITHUB_API_URL = "https://api.github.com"
GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"


def parse_requirements(file_path, github_token):
    requirements = []
    current_req = None
    current_case = None  # most recently parsed case, so case-level issues can attach to it
    current_section = ""
    current_subsection = ""

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line in lines:
        # Detect section headings ("## ...")
        section_match = re.match(r"^##\s(.+)", line)
        if section_match:
            current_section = section_match.group(1)
            continue

        # Detect subsection headings ("### ...")
        subsection_match = re.match(r"^###\s(.+)", line)
        if subsection_match:
            current_subsection = subsection_match.group(1)
            continue

        # Identify a bare GitHub issue reference ("- #1234") and treat it as a requirement
        issue_match = re.match(r"- #(\d+)", line)
        if issue_match:
            issue_number = issue_match.group(1)
            issue_data = fetch_github_issue(issue_number, github_token)
            if issue_data:
                if current_req:
                    requirements.append(current_req)
                issue_id = issue_data.get('node_id')
                sub_issues = fetch_sub_issues_by_id(issue_id, github_token) if issue_id else []
                if issue_data.get('sub_issues_summary'):
                    percent_completed = issue_data['sub_issues_summary']['percent_completed']
                    total = issue_data['sub_issues_summary']['total']
                    completed = issue_data['sub_issues_summary']['completed']
                    if percent_completed == 100:
                        status = 'DONE'
                        color = 'rgb(249%2C%20239%2C%20254%2C1)'
                    elif 0 < percent_completed < 100:
                        status = 'PROGRESS'
                        color = 'rgb(254%2C%20248%2C%20202%2C1)'
                    else:
                        status = 'TO%20DO'
                        color = 'rgb(224%2C%20250%2C%20227%2C1)'
                    issue_data['badge'] = (
                        f"![{status}](https://img.shields.io/badge/{status}-{completed}%2F{total}:"
                        f"{percent_completed}%25-{color}?style=for-the-badge&logo=database&labelColor=grey)"
                    )
                current_req = {
                    'id': f"ISSUE-{issue_number}",
                    'title': issue_data['title'],
                    'description': issue_data['body'],
                    'url': issue_data['html_url'],
                    'body': issue_data['body'],
                    'cases': sub_issues,  # sub-issues serve as this requirement's cases
                    'section': current_section,
                    'subsection': current_subsection,
                    'sub_issues_summary': issue_data.get('sub_issues_summary'),
                    'badge': issue_data.get('badge')
                }
            continue

        # Identify a new requirement ("- **REQ-XXX-001**: title")
        req_match = re.match(r"- \*\*(REQ-[A-Z]+-\d+)\*\*: (.+)", line)
        if req_match:
            if current_req:
                requirements.append(current_req)
            current_req = {
                'id': req_match.group(1),
                'title': req_match.group(2),
                'cases': [],
                'issues': [],
                'section': current_section,
                'subsection': current_subsection
            }

        # Identify a requirement description ("- **Description**: ...")
        req_description_match = re.match(r"\s+- \*\*Description\*\*: (.+)", line)
        if req_description_match and current_req:
            current_req['description'] = req_description_match.group(1)

        # Identify cases with paths ("- Case 1.1: [name](path) - description")
        case_match = re.match(r"\s+- Case (\d+\.\d+): \[(.+)\]\((.+)\) - (.+)", line)
        if case_match and current_req:
            current_case = {
                'case_id': f"{current_req['id']}-{case_match.group(1)}",
                'name': case_match.group(2),
                'description': case_match.group(4),
                'path': case_match.group(3),
                'issues': [],
                'status': "Pending"
            }
            current_req['cases'].append(current_case)
            continue

        # Identify case-level issues (indented at least six spaces, under a case line).
        # Matched before requirement-level issues so they attach to the case, not the requirement.
        case_issue_match = re.match(r"\s{6}- ISSUE:(.+):(.+)", line)
        if case_issue_match and current_case:
            issue_id = case_issue_match.group(2).split('/')[-1]
            issue_desc = case_issue_match.group(1)
            current_case['issues'].append({
                'id': issue_id,
                'description': issue_desc,
                'badge': f"[![GitHub issue/pull request detail](https://img.shields.io/github/issues/detail/state/ydb-platform/ydb/{issue_id})](https://github.com/ydb-platform/ydb/issues/{issue_id})"
            })
            continue

        # Identify requirement-level issues ("- ISSUE:<description>:<url>")
        issue_match = re.match(r"\s+- ISSUE:(.+):(.+)", line)
        if issue_match and current_req:
            issue_id = issue_match.group(2).split('/')[-1]
            issue_desc = issue_match.group(1)
            current_req['issues'].append({
                'id': issue_id,
                'description': issue_desc,
                'badge': f"[![GitHub issue/pull request detail](https://img.shields.io/github/issues/detail/state/ydb-platform/ydb/{issue_id})](https://github.com/ydb-platform/ydb/issues/{issue_id})"
            })

    if current_req:
        requirements.append(current_req)

    return requirements

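# For illustration (an assumed example, not part of the commit): an issue whose
# sub_issues_summary reports completed=5, total=5 (percent_completed == 100) would
# get the badge markdown
#   ![DONE](https://img.shields.io/badge/DONE-5%2F5:100%25-rgb(249%2C%20239%2C%20254%2C1)?style=for-the-badge&logo=database&labelColor=grey)
# i.e. a shields.io "for-the-badge" image whose message shows completed/total and
# the completion percentage, colour-coded by status.
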
def fetch_github_issue(issue_number, github_token):
    # REST call for a single issue in ydb-platform/ydb; returns parsed JSON or None.
    headers = {"Authorization": f"token {github_token}"}
    response = requests.get(f"{GITHUB_API_URL}/repos/ydb-platform/ydb/issues/{issue_number}", headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch issue #{issue_number}: {response.status_code} {response.text}")
        return None


def fetch_sub_issues_by_id(issue_id, github_token):
    # Page through an issue's sub-issues via the GraphQL API, 100 per page.
    query = """
    query($issueId: ID!, $after: String) {
      node(id: $issueId) {
        ... on Issue {
          subIssues(first: 100, after: $after) {
            nodes {
              title
              number
              url
              id
              body
            }
            pageInfo {
              hasNextPage
              endCursor
            }
          }
        }
      }
    }
    """

    variables = {
        "issueId": issue_id,
        "after": None
    }

    headers = {
        "Authorization": f"Bearer {github_token}",
        "GraphQL-Features": "sub_issues"  # sub-issues are gated behind this preview header
    }

    sub_issues = []

    while True:
        response = requests.post(GITHUB_GRAPHQL_URL, json={"query": query, "variables": variables}, headers=headers)

        if response.status_code != 200:
            print(f"GraphQL query failed: {response.status_code} {response.text}")
            break

        data = response.json()
        sub_issues_data = data['data']['node']['subIssues']
        for node in sub_issues_data['nodes']:
            sub_issues.append({
                'case_id': f"#{node['number']}",
                'name': node['title'],
                'description': node['body'].split('\n')[0],  # first line of the issue body
                'path': node['url'],
                'issue': node['number'],
                'status': "Pending",
                'badge': f"[![GitHub issue/pull request detail](https://img.shields.io/github/issues/detail/state/ydb-platform/ydb/{node['number']})](https://github.com/ydb-platform/ydb/issues/{node['number']})"
            })

        if not sub_issues_data['pageInfo']['hasNextPage']:
            break
        variables['after'] = sub_issues_data['pageInfo']['endCursor']

    return sub_issues


def to_anchor(s):
    # Build a GitHub-style heading anchor, e.g.
    # to_anchor("REQ-PERF-001: Some title") -> "#req-perf-001-some-title"
    return '#' + re.sub(r'[\s/:()]+', '-', s.lower().replace('/', '').replace('+', '')).strip('-')


def generate_traceability_matrix(requirements, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write("# Traceability Matrix\n\n")
        section = ''
        subsection = ''
        for req in requirements:
            if section != req['section']:
                file.write(f"## {req['section']}\n\n")
                section = req['section']
            if subsection != req['subsection']:
                file.write(f"### {req['subsection']}\n")
                subsection = req['subsection']

            if req.get('url'):
                file.write(f"#### [{req['id']}]({req['url']}): {req['title']}\n")
            else:
                file.write(f"#### {req['id']}: {req['title']}\n")
            if req.get('badge'):
                anchor = to_anchor(f"{req['id']}: {req['title']}")
                file.write(f"[{req['badge']}](./summary.md{anchor})\n\n")
            if req.get('description'):
                file.write(f"**Description**: {req['description']}\n\n")
            if req.get('issues'):
                file.write("Issues:\n")
                for issue in req['issues']:
                    file.write(f"- {issue['id']}: {issue['description']}\n")
                file.write("\n")

            file.write("| Case ID | Name | Description | Issues | Status |\n")
            file.write("|---------|------|-------------|--------|:--------|\n")

            for case in req['cases']:
                # Combine badges from the case itself, its own issues, and the requirement's issues.
                issues_list = ""
                if case.get('badge'):
                    issues_list = case['badge']
                if case.get('issues'):
                    issues_list += ','.join(issue['badge'] for issue in case['issues'])
                if req.get('issues'):
                    issues_list += ','.join(issue['badge'] for issue in req['issues'])
                file.write(f"| {case['case_id']} | {case['name']} | {case['description']} | {issues_list} | {case['status']} |\n")
            file.write("\n")


def generate_summary(requirements, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write("# Summary\n\n")
        section = ''
        subsection = ''
        total = 0
        completed = 0
        for req in requirements:
            if req.get('sub_issues_summary'):
                total += req['sub_issues_summary']['total']
                completed += req['sub_issues_summary']['completed']
        percent = round(completed * 100 / total, 2) if total > 0 else 0
        file.write(f"**Completed tests: {completed}/{total}: {percent}%**\n\n")
        if requirements:
            # Opening section header, taken from the last parsed requirement
            # (matches the layout of the generated summary.md).
            file.write(f"## {requirements[-1]['section']}\n\n")
        for req in requirements:
            if req.get('sub_issues_summary'):
                if section != req['section']:
                    file.write(f"## {req['section']}\n\n")
                    section = req['section']
                if subsection != req['subsection']:
                    file.write(f"### {req['subsection']}\n")
                    subsection = req['subsection']

                if req.get('url'):
                    file.write(f"#### [{req['id']}]({req['url']}): {req['title']}\n")
                else:
                    file.write(f"#### {req['id']}: {req['title']}\n")
                if req.get('description'):
                    file.write(f"**Description**: {req['description']}\n\n")
                if req.get('badge'):
                    anchor = to_anchor(f"{req['id']}: {req['title']}")
                    file.write(f"[{req['badge']}](./traceability_matrix.md{anchor})\n\n")
                if req.get('issues'):
                    file.write("Issues:\n")
                    for issue in req['issues']:
                        file.write(f"- {issue['id']}: {issue['description']}\n")
                    file.write("\n")


def collect_requirements_from_directory(directory, github_token):
    # Gather requirements from every req*.md file under `directory`, recursively.
    requirements = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.startswith('req') and file.endswith('.md'):
                file_path = os.path.join(root, file)
                requirements.extend(parse_requirements(file_path, github_token))
    return requirements


def process_and_generate_matrices(base_directory, github_token):
    for root, subdirs, files in os.walk(base_directory):
        # Collect requirements from the current directory and everything below it
        requirements = collect_requirements_from_directory(root, github_token)

        if requirements:
            matrix_output_file = os.path.join(root, 'traceability_matrix.md')
            summary_output_file = os.path.join(root, 'summary.md')
            generate_traceability_matrix(requirements, matrix_output_file)
            print(f"Generated traceability matrix in {matrix_output_file}")
            generate_summary(requirements, summary_output_file)
            print(f"Generated summary in {summary_output_file}")


if __name__ == "__main__":
    GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")  # set this environment variable to your GitHub token
    current_directory = os.path.dirname(os.path.abspath(__file__))
    process_and_generate_matrices(current_directory, GITHUB_TOKEN)
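
Taken together, the script walks the directory it lives in, parses every `req*.md` file, and writes a `traceability_matrix.md` and `summary.md` beside each group of requirements it finds. A minimal usage sketch follows; the import path and the requirements file name are illustrative assumptions, not part of the commit:

```python
# Sketch: run the collector against one hand-picked requirements file.
# Assumes collect_reqs.py is on the import path and GITHUB_TOKEN is exported.
import os

from collect_reqs import generate_summary, generate_traceability_matrix, parse_requirements

token = os.environ["GITHUB_TOKEN"]  # a token able to read ydb-platform/ydb issues
reqs = parse_requirements("req_olap_analytics.md", token)  # hypothetical file name
generate_traceability_matrix(reqs, "traceability_matrix.md")
generate_summary(reqs, "summary.md")
```

Running the module directly (`python3 collect_reqs.py`) does the same thing recursively, starting from the script's own directory.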
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
# Requirements for YDB Analytics System

## Introduction

This document outlines the detailed functional and non-functional requirements for the YDB analytics system, including the test cases used to verify them, with a focus on aggregate functions and complex analytical queries.

## Non-functional Requirements

### Performance

- **REQ-PERF-001**: Ensure the system handles aggregate functions efficiently across various data sizes.
  - **Description**: Verify that aggregate functions maintain performance standards at increasing data volumes, ensuring response times stay within acceptable limits.
  - **Cases**:
    - Case 1.1: [COUNT Function Performance - 1GB](path/to/test/count_1gb) - Validate performance with a dataset of 1GB.
    - Case 1.2: [COUNT Function Performance - 10GB](path/to/test/count_10gb) - Validate performance with a dataset of 10GB.
    - Case 1.3: [COUNT Function Performance - 100GB](path/to/test/count_100gb) - Validate performance with a dataset of 100GB.
    - Case 1.4: [COUNT Function Performance - 1TB](path/to/test/count_1tb) - Validate performance with a dataset of 1TB.
    - Case 1.5: [COUNT Function Performance - 10TB](path/to/test/count_10tb) - Validate performance with a dataset of 10TB.

- **REQ-PERF-002**: Ensure the system can efficiently compute distinct counts at scale.
  - **Description**: Evaluate the ability to perform COUNT(DISTINCT) operations with acceptable overhead across increasing data volumes.
  - **Cases**:
    - Case 2.1: [COUNT DISTINCT Performance - 1GB](path/to/test/count_distinct_1gb) - Measure distinct count efficiency at 1GB.
    - Case 2.2: [COUNT DISTINCT Performance - 10GB](path/to/test/count_distinct_10gb) - Measure distinct count efficiency at 10GB.

- **REQ-PERF-003**: Validate the efficiency of SUM operations over large datasets.
  - **Description**: Ensure SUM functions execute with optimal performance at different data scales.
  - **Cases**:
    - Case 3.1: [SUM Function Performance - 1GB](path/to/test/sum_1gb) - Validate SUM operation efficiency with 1GB of data.
    - Case 3.2: [SUM Function Performance - 10GB](path/to/test/sum_10gb) - Validate SUM operation efficiency with 10GB of data.

- **REQ-PERF-004**: Ensure the system maintains average-calculation efficiency.
  - **Description**: Verify that AVG functions sustain performance as data sizes increase.
  - **Cases**:
    - Case 4.1: [AVG Function Performance - 1GB](path/to/test/avg_1gb) - Performance metrics for the AVG operation on 1GB of data.

- **REQ-PERF-005**: Efficient computation of MIN/MAX operations.
  - **Description**: Confirm that minimum and maximum functions perform within the expected time frames across various datasets.
  - **Cases**:
    - Case 5.1: [MIN/MAX Performance - 1GB](path/to/test/min_max_1gb) - Validate performance of MIN/MAX operations with 1GB.

- **REQ-PERF-006**: TPC-H benchmark testing for scalability.
  - **Description**: Evaluate system performance using TPC-H benchmark tests at different dataset volumes.
  - **Cases**:
    - Case 6.1: [TPC-H Performance - 10GB](path/to/test/tpch_10gb) - Validate TPC-H benchmark performance with 10GB.

- **REQ-PERF-007**: ClickBench benchmark testing of efficiency under different conditions.
  - **Description**: Assess system capabilities using ClickBench, targeting different data sizes.
  - **Cases**:
    - Case 7.1: [ClickBench Performance - 1GB](path/to/test/clickbench_1gb) - Evaluate with ClickBench on 1GB of data.

These requirements provide a framework for measuring and ensuring performance across the key analytic functionalities of the YDB analytics system, with a specific focus on scalability and efficiency.
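
For illustration, the in-memory record that `collect_reqs.py` above builds for REQ-PERF-001 looks roughly like this (abridged by hand, not actual script output; issue badges would be attached via `ISSUE:` lines, which this file does not use):

```python
# Approximate parsed form of REQ-PERF-001 (abridged); every case starts as "Pending".
{
    'id': 'REQ-PERF-001',
    'title': 'Ensure the system handles aggregate functions efficiently across various data sizes.',
    'description': 'Verify that aggregate functions maintain performance standards at increasing data volumes, ...',
    'section': 'Non-functional Requirements',
    'subsection': 'Performance',
    'issues': [],
    'cases': [
        {'case_id': 'REQ-PERF-001-1.1', 'name': 'COUNT Function Performance - 1GB',
         'path': 'path/to/test/count_1gb',
         'description': 'Validate performance with a dataset of 1GB.',
         'issues': [], 'status': 'Pending'},
        # ... Cases 1.2-1.5 have the same shape
    ],
}
```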
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
# Summary

**Completed tests: 0/0: 0%**

## Non-functional Requirements