Skip to content

Commit fa1bb74

Browse files
authored
ref(similarity-embeddings): Add hash column, defer stacktrace hash column (#625)
Add hash column with default value. Accept hash and return parent_hash in the similarity API. Defer the unused stacktrace_hash column.
1 parent 506aae6 commit fa1bb74

File tree

6 files changed

+129
-123
lines changed

6 files changed

+129
-123
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""Migration
2+
3+
Revision ID: 7278a2303b10
4+
Revises: 34eef02b2555
5+
Create Date: 2024-05-02 15:56:39.243317
6+
7+
"""
8+
import sqlalchemy as sa
9+
from alembic import op
10+
11+
# revision identifiers, used by Alembic.
12+
revision = "7278a2303b10"
13+
down_revision = "34eef02b2555"
14+
branch_labels = None
15+
depends_on = None
16+
17+
18+
def upgrade():
19+
# ### commands auto generated by Alembic - please adjust! ###
20+
with op.batch_alter_table("grouping_records", schema=None) as batch_op:
21+
batch_op.add_column(
22+
sa.Column(
23+
"hash",
24+
sa.String(length=32),
25+
server_default="00000000000000000000000000000000",
26+
nullable=False,
27+
)
28+
)
29+
30+
# ### end Alembic commands ###
31+
32+
33+
def downgrade():
34+
# ### commands auto generated by Alembic - please adjust! ###
35+
with op.batch_alter_table("grouping_records", schema=None) as batch_op:
36+
batch_op.drop_column("hash")
37+
38+
# ### end Alembic commands ###

src/seer/db.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
)
2424
from sqlalchemy.dialects.postgresql import insert
2525
from sqlalchemy.ext.asyncio import async_sessionmaker
26-
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, sessionmaker
26+
from sqlalchemy.orm import DeclarativeBase, Mapped, deferred, mapped_column, sessionmaker
2727

2828

2929
class Base(DeclarativeBase):
@@ -229,7 +229,12 @@ class DbGroupingRecord(Base):
229229
project_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
230230
message: Mapped[str] = mapped_column(String, nullable=False)
231231
stacktrace_embedding: Mapped[Vector] = mapped_column(Vector(768), nullable=False)
232-
stacktrace_hash: Mapped[Optional[str]] = mapped_column(String(32), nullable=True)
232+
stacktrace_hash: Mapped[Optional[str]] = deferred(
233+
mapped_column(String(32).evaluates_none(), nullable=True)
234+
)
235+
hash: Mapped[str] = mapped_column(
236+
String(32), nullable=False, default="00000000000000000000000000000000"
237+
)
233238

234239
__table_args__ = (
235240
Index(

src/seer/grouping/grouping.py

+21-44
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import torch
99
from pydantic import BaseModel, ValidationInfo, field_validator
1010
from sentence_transformers import SentenceTransformer
11+
from sqlalchemy import or_
1112

1213
from seer.db import DbGroupingRecord, Session
1314

@@ -18,8 +19,8 @@ class GroupingRequest(BaseModel):
1819
project_id: int
1920
stacktrace: str
2021
message: str
21-
group_id: int | None = None
22-
stacktrace_hash: str | None = None
22+
hash: str
23+
group_id: Optional[int] = None
2324
k: int = 1
2425
threshold: float = 0.01
2526

@@ -32,19 +33,19 @@ def check_field_is_not_empty(cls, v, info: ValidationInfo):
3233

3334

3435
class GroupingRecord(BaseModel):
35-
group_id: int | None
36+
group_id: Optional[int]
3637
project_id: int
3738
message: str
3839
stacktrace_embedding: np.ndarray
39-
stacktrace_hash: str | None
40+
hash: str
4041

4142
def to_db_model(self) -> DbGroupingRecord:
4243
return DbGroupingRecord(
4344
group_id=self.group_id,
4445
project_id=self.project_id,
4546
message=self.message,
4647
stacktrace_embedding=self.stacktrace_embedding,
47-
stacktrace_hash=self.stacktrace_hash,
48+
hash=self.hash,
4849
)
4950

5051
class Config:
@@ -56,14 +57,14 @@ class Config:
5657

5758
class GroupingResponse(BaseModel):
5859
parent_group_id: Optional[int]
60+
parent_hash: str
5961
stacktrace_distance: float
6062
message_distance: float
6163
should_group: bool
6264

6365

6466
class SimilarityResponse(BaseModel):
6567
responses: List[GroupingResponse]
66-
token: Optional[int]
6768

6869

6970
class SimilarityBenchmarkResponse(BaseModel):
@@ -149,25 +150,6 @@ def get_nearest_neighbors(self, issue: GroupingRequest) -> SimilarityResponse:
149150
stacktrace similarity scores, message similarity scores, and grouping flags.
150151
"""
151152
with Session() as session:
152-
# If an exact match of the stacktrace hash is found, return this record
153-
if hasattr(issue, "stacktrace_hash") and issue.stacktrace_hash:
154-
existing_record = (
155-
session.query(DbGroupingRecord)
156-
.filter_by(stacktrace_hash=issue.stacktrace_hash)
157-
.first()
158-
)
159-
if existing_record:
160-
similarity_response = SimilarityResponse(responses=[], token=None)
161-
similarity_response.responses.append(
162-
GroupingResponse(
163-
parent_group_id=existing_record.group_id,
164-
stacktrace_distance=0.00,
165-
message_distance=0.00,
166-
should_group=True,
167-
)
168-
)
169-
return similarity_response
170-
171153
embedding = self.encode_text(issue.stacktrace).astype("float32")
172154

173155
results = (
@@ -180,22 +162,24 @@ def get_nearest_neighbors(self, issue: GroupingRequest) -> SimilarityResponse:
180162
.filter(
181163
DbGroupingRecord.project_id == issue.project_id,
182164
DbGroupingRecord.stacktrace_embedding.cosine_distance(embedding) <= 0.15,
183-
DbGroupingRecord.group_id != issue.group_id,
184-
DbGroupingRecord.group_id != None,
165+
or_(
166+
DbGroupingRecord.group_id != issue.group_id,
167+
DbGroupingRecord.group_id == None,
168+
),
169+
# TODO: A group may be returned as similar to itself if its record exists in the old table with no hash
170+
DbGroupingRecord.hash != issue.hash,
185171
)
186172
.order_by("distance")
187173
.limit(issue.k)
188174
.all()
189175
)
190176

191177
# If no existing groups within the threshold, insert the request as a new GroupingRecord
192-
token = None
193178
if not any(distance <= issue.threshold for _, distance in results):
194-
token = self.insert_new_grouping_record(session, issue, embedding)
195-
179+
self.insert_new_grouping_record(session, issue, embedding)
196180
session.commit()
197181

198-
similarity_response = SimilarityResponse(responses=[], token=token)
182+
similarity_response = SimilarityResponse(responses=[])
199183
for record, distance in results:
200184
message_similarity_score = difflib.SequenceMatcher(
201185
None, issue.message, record.message
@@ -204,7 +188,8 @@ def get_nearest_neighbors(self, issue: GroupingRequest) -> SimilarityResponse:
204188

205189
similarity_response.responses.append(
206190
GroupingResponse(
207-
parent_group_id=record.group_id,
191+
parent_group_id=record.group_id if hasattr(record, "group_id") else None,
192+
parent_hash=record.hash,
208193
stacktrace_distance=distance,
209194
message_distance=1.0 - message_similarity_score,
210195
should_group=should_group,
@@ -215,7 +200,7 @@ def get_nearest_neighbors(self, issue: GroupingRequest) -> SimilarityResponse:
215200

216201
def insert_new_grouping_record(
217202
self, session, issue: GroupingRequest, embedding: np.ndarray
218-
) -> int:
203+
) -> None:
219204
"""
220205
Inserts a new GroupingRecord into the database if no record with the same hash already exists.
221206
Returns nothing; a newly created record is only added to the session, and the caller is responsible for committing.
@@ -224,22 +209,14 @@ def insert_new_grouping_record(
224209
:param issue: The issue to insert as a new GroupingRecord.
225210
:param embedding: The embedding of the stacktrace.
226211
"""
227-
existing_record = None
228-
if issue.group_id:
229-
existing_record = (
230-
session.query(DbGroupingRecord).filter_by(group_id=issue.group_id).first()
231-
)
212+
existing_record = session.query(DbGroupingRecord).filter_by(hash=issue.hash).first()
232213

233214
if existing_record is None:
234215
new_record = GroupingRecord(
235-
group_id=issue.group_id,
216+
group_id=issue.group_id if hasattr(issue, "group_id") else None,
236217
project_id=issue.project_id,
237218
message=issue.message,
238219
stacktrace_embedding=embedding,
239-
stacktrace_hash=issue.stacktrace_hash,
220+
hash=issue.hash,
240221
).to_db_model()
241222
session.add(new_record)
242-
session.commit()
243-
return new_record.id
244-
245-
return existing_record.id

src/seer/schemas/seer.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@
8686
GroupingRequest = typing_extensions.TypedDict(
8787
"GroupingRequest",
8888
{
89-
"group_id": int,
89+
"hash": str,
90+
"group_id": typing.Union[int, None],
9091
"project_id": int,
9192
"stacktrace": str,
9293
"message": str,
@@ -102,6 +103,7 @@
102103
"GroupingResponse",
103104
{
104105
"parent_group_id": typing.Union[int, None],
106+
"parent_hash": str,
105107
"stacktrace_similarity": float,
106108
"message_similarity": float,
107109
"should_group": bool,

src/seer/schemas/seer_api.json

+18-3
Original file line numberDiff line numberDiff line change
@@ -386,9 +386,20 @@
386386
"GroupingRequest": {
387387
"properties": {
388388
"group_id": {
389-
"type": "integer",
389+
"anyOf": [
390+
{
391+
"type": "integer"
392+
},
393+
{
394+
"type": "null"
395+
}
396+
],
390397
"title": "Group Id"
391398
},
399+
"hash": {
400+
"type": "string",
401+
"title": "Hash"
402+
},
392403
"project_id": {
393404
"type": "integer",
394405
"title": "Project Id"
@@ -413,7 +424,7 @@
413424
}
414425
},
415426
"type": "object",
416-
"required": ["group_id", "project_id", "stacktrace", "message"],
427+
"required": ["hash", "project_id", "stacktrace", "message"],
417428
"title": "GroupingRequest"
418429
},
419430
"GroupingResponse": {
@@ -429,6 +440,10 @@
429440
],
430441
"title": "Parent Group Id"
431442
},
443+
"parent_hash": {
444+
"type": "string",
445+
"title": "Parent Hash"
446+
},
432447
"stacktrace_similarity": {
433448
"type": "number",
434449
"title": "Stacktrace Similarity"
@@ -444,7 +459,7 @@
444459
},
445460
"type": "object",
446461
"required": [
447-
"parent_group_id",
462+
"parent_hash",
448463
"stacktrace_similarity",
449464
"message_similarity",
450465
"should_group"

0 commit comments

Comments
 (0)