3 files changed, 20 insertions, 7 deletions
diff --git a/classification/.gitignore b/classification/.gitignore
index 53752db2..d847c80b 100644
--- a/classification/.gitignore
+++ b/classification/.gitignore
@@ -1 +1,2 @@
 output
+reasoning
diff --git a/classification/classifier.py b/classification/classifier.py
index 43f5c13c..7b08439a 100755
--- a/classification/classifier.py
+++ b/classification/classifier.py
@@ -4,6 +4,7 @@ from datetime import timedelta
 from time import monotonic
 from argparse import ArgumentParser
 from ollama import chat, ChatResponse
+from re import sub
 
 parser = ArgumentParser(prog='classifier.py')
 parser.add_argument('-f', '--full', action='store_true', help="use whole dataset")
@@ -15,17 +16,20 @@ args = parser.parse_args()
 
 positive_categories = ['semantic', 'TCG', 'assembly', 'architecture', 'mistranslation', 'register', 'user-level']
 architectures = ['x86', 'arm', 'risc-v', 'i386', 'ppc']
-negative_categories = ['boot', 'network', 'KVM', 'vnc', 'graphic', 'device', 'socket', 'debug', 'files', 'PID', 'permissions', 'performance', 'kernel', 'peripherals', 'VMM', 'hypervisor', 'virtual' ]
+negative_categories = ['boot', 'network', 'KVM', 'vnc', 'graphic', 'device', 'socket', 'debug', 'files', 'PID', 'permissions', 'performance', 'kernel', 'peripherals', 'VMM', 'hypervisor', 'virtual', 'other']
 categories = positive_categories + negative_categories + architectures
 
-def list_files_recursive(directory):
+def list_files_recursive(directory, basename = False):
     result = []
     for entry in listdir(directory):
         full_path = path.join(directory, entry)
         if path.isdir(full_path):
-            result = result + list_files_recursive(full_path)
+            result = result + list_files_recursive(full_path, basename)
         else:
-            result.append(full_path)
+            if basename:
+                result.append(path.basename(full_path))
+            else:
+                result.append(full_path)
     return result
 
 def output(text : str, category : str, labels : list, scores : list, identifier : str, reasoning : str = None):
@@ -106,24 +110,32 @@ def main():
         with open("preambel", "r") as file:
             preambel = file.read()
 
+    processed_bugs = list_files_recursive("output", True)
     bugs = list_files_recursive("../results/scraper/mailinglist")
     bugs = []
     if not args.full:
         bugs = bugs + list_files_recursive("../results/scraper/gitlab/semantic_issues")
         bugs = bugs + [ "../results/scraper/launchpad/1809546", "../results/scraper/launchpad/1156313" ]
     else:
-        bugs = bugs + list_files_recursive("../results/scraper/launchpad")
+        bugs = bugs + list_files_recursive("../results/scraper/launchpad-without-comments")
         bugs = bugs + list_files_recursive("../results/scraper/gitlab/issues_text")
 
     print(f"{len(bugs)} number of bugs will be processed")
     for i, bug in enumerate(bugs):
         print(f"Bug: {bug}, Number: {i+1},", end=" ")
+
+        if path.basename(bug) in processed_bugs:
+            print("skipped")
+            continue
+
         with open(bug, "r") as file:
             text = file.read()
 
         if args.deepseek:
             response = chat(args.deepseek, [{'role': 'user', 'content': preambel + "\n" + text,}])
-            category = response['message']['content'].split()[-1].strip("* ")
+            category = sub(r'[^a-zA-Z]', '', response['message']['content'].split()[-1])
+            if not category in categories:
+                category = "manual-review"
             output(text, category, [], [], path.basename(bug), response['message']['content'])
         else:
             result = classifier(text, categories, multi_label=args.multi_label)
diff --git a/classification/preambel b/classification/preambel
index c85f8d25..3f3cc741 100644
--- a/classification/preambel
+++ b/classification/preambel
@@ -2,7 +2,7 @@ Classify the following bug report. it is part of qemu.
 
 These are the possible categories:
 
-mistranslation: incorrect semantic mapping from source architecture to IR/target, which happen in user-mode
+mistranslation: incorrect semantic mapping from source architecture to IR/target, which happens in user-mode
 assembly: assembly lowering
 other: other
 device