Skip to content

Commit 713206f

Browse files
committed
improve readability, logging, fix spellcheck test
1 parent 8ddcfd8 commit 713206f

File tree

5 files changed

+668
-575
lines changed

5 files changed

+668
-575
lines changed

.github/scripts/spellcheck_conf/wordlist.txt

+4
Original file line numberDiff line numberDiff line change
@@ -1451,3 +1451,7 @@ openhathi
14511451
sarvam
14521452
subtask
14531453
acc
1454+
Triaging
1455+
matplotlib
1456+
remediations
1457+
walkthrough

recipes/use_cases/github_triage/llm.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import time
55
import json
66

7+
from tqdm import tqdm
78
from openai import OpenAI
89
import groq
910

@@ -37,7 +38,7 @@ def chat(
3738
)
3839
output = response.choices[0].message
3940
except Exception as e:
40-
log.error(
41+
logger.error(
4142
f"FAILED to generate inference for input {inputs}\nError: {str(e)}"
4243
)
4344
output = None
@@ -85,7 +86,8 @@ def chat(
8586
print(f"[groq] waiting for {wait} to prevent ratelimiting")
8687
time.sleep(wait)
8788
except Exception as e:
88-
logger.error(f"INFERENCE FAILED with Error: {e.response.status_code}! for input:\n{inputs[-1]['content'][:300]}")
89+
logger.error(f"INFERENCE FAILED with Error: {e.response.status_code} for input:\n{inputs[-1]['content'][:300]}")
90+
break
8991

9092
return output
9193

@@ -141,7 +143,8 @@ def run_llm_inference(
141143
)
142144

143145
responses = [
144-
LLM.chat(i, generation_kwargs, guided_decode_json_schema) for i in inputs
146+
LLM.chat(i, generation_kwargs, guided_decode_json_schema)
147+
for i in tqdm(inputs, desc=f"Inference[{prompt_name}]")
145148
]
146149

147150
if guided_decode_json_schema is not None:
@@ -159,4 +162,4 @@ def run_llm_inference(
159162
if not _batch:
160163
responses = responses[0]
161164

162-
return responses
165+
return responses

recipes/use_cases/github_triage/triage.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def _categorize_issues(
6868
}
6969
return themes, theme_count
7070

71-
logger.info(f"Generating annotations for {len(issues_df)}")
71+
logger.info(f"Generating annotations for {len(issues_df)} issues")
7272

7373
discussions = issues_df["discussion"].tolist()
7474
metadata = run_llm_inference(

recipes/use_cases/github_triage/utils.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ def fetch_repo_issues(repo, start_date=None, end_date=None):
3131
url = f"https://api.github.com/search/issues?per_page=100&sort=created&order=asc&q=repo:{repo}+is:issue{time_filter}"
3232

3333
samples = []
34-
logger.info(f"Fetching issues on {repo} from {start_date} to {end_date}")
3534

3635
while True:
3736
response = fetch_github_endpoint(url)
@@ -61,8 +60,7 @@ def fetch_repo_issues(repo, start_date=None, end_date=None):
6160
else:
6261
break
6362
else:
64-
raise Exception(f"Fetching issues failed with Error: {response.status_code}")
65-
print()
63+
raise Exception(f"Fetching issues failed with Error: {response.status_code} on url {url}")
6664

6765
rows = [{
6866
"repo_name": repo,
@@ -93,12 +91,8 @@ def fetch_repo_stats(repo):
9391

9492
def validate_df_values(df, out_folder=None, name=None):
9593
df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", "_")
96-
# for c in df.columns:
97-
# x = df[c].iloc[0]
98-
# if isinstance(x, str) and '[' in x:
99-
# df[c] = df[c].apply(lambda x: eval(x))
10094
if out_folder is not None:
10195
path = f"{out_folder}/{name}.csv"
10296
df.to_csv(path, index=False)
10397
logger.info(f"Data saved to {path}")
104-
return df
98+
return df

0 commit comments

Comments (0)