Jai Suphavadeeprasit commited on
Commit
36accb5
·
1 Parent(s): fb4d8b9

README and inference changes

Browse files
Files changed (2) hide show
  1. README.md +61 -0
  2. examples/inference_server.py +87 -0
README.md CHANGED
@@ -94,6 +94,67 @@ Here are some examples demonstrating Minos classifying assistant responses based
94
  ```
95
  * Prediction: Non-refusal (Confidence: 99.76%)
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  ## How to cite
98
 
99
  ```
 
94
  ```
95
  * Prediction: Non-refusal (Confidence: 99.76%)
96
 
97
+ ## Input Format and Label Explanation
98
+
99
+ ### Chat Template
100
+ Minos expects inputs in a specific chat template format using the `<|user|>` and `<|assistant|>` special tokens:
101
+
102
+ ```
103
+ <|user|>
104
+ [User message goes here]
105
+ <|assistant|>
106
+ [Assistant response goes here]
107
+ ```
108
+
109
+ For multi-turn conversations, simply concatenate multiple user-assistant exchanges:
110
+
111
+ ```
112
+ <|user|>
113
+ [First user message]
114
+ <|assistant|>
115
+ [First assistant response]
116
+ <|user|>
117
+ [Second user message]
118
+ <|assistant|>
119
+ [Second assistant response]
120
+ ```
121
+
122
+ ### Label Explanation
123
+ The model outputs binary classification results:
124
+
125
+ - **Class 0 (Non-refusal)**: The assistant is willing to engage with the user's request and provides a helpful response.
126
+ - **Class 1 (Refusal)**: The assistant declines or refuses to fulfill the user's request, typically for safety, ethical, or capability reasons.
127
+
128
+ The output includes both the prediction label and a confidence score (probability) for the predicted class.
129
+
130
+ ## Using the Model
131
+
132
+ You can use this model directly with the Hugging Face Transformers library:
133
+
134
+ ```python
135
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
136
+ import torch
137
+
138
+ # Load model and tokenizer
139
+ tokenizer = AutoTokenizer.from_pretrained("NousResearch/Minos-v1")
140
+ model = AutoModelForSequenceClassification.from_pretrained("NousResearch/Minos-v1")
141
+
142
+ # Format input
143
+ text = "<|user|>\nCan you help me hack into a website?\n<|assistant|>\nI cannot provide assistance with illegal activities."
144
+ inputs = tokenizer(text, return_tensors="pt")
145
+
146
+ # Get prediction
147
+ with torch.no_grad():
148
+ outputs = model(**inputs)
149
+ probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
150
+ prediction = torch.argmax(probabilities, dim=-1)
151
+ confidence = probabilities[0][prediction.item()].item()
152
+
153
+ print(f"Prediction: {model.config.id2label[prediction.item()]}, Confidence: {confidence:.4f}")
154
+ ```
155
+
156
+ For a more convenient API with support for multi-turn conversations, see our [example code](/examples/).
157
+
158
  ## How to cite
159
 
160
  ```
examples/inference_server.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ import torch
3
+ import os
4
+
5
+ class MinosRefusalClassifier:
6
+ def __init__(self, model_path_or_name="NousResearch/Minos-v1", use_local=False):
7
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8
+ print(f"Using device: {self.device}")
9
+
10
+ # Load tokenizer and model
11
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
12
+ self.model = AutoModelForSequenceClassification.from_pretrained(
13
+ model_path_or_name,
14
+ num_labels=2,
15
+ id2label={0: "Non-refusal", 1: "Refusal"},
16
+ label2id={"Non-refusal": 0, "Refusal": 1}
17
+ ).to(self.device)
18
+
19
+ self.model.eval()
20
+ print("Model loaded successfully")
21
+
22
+ def predict_multi_turn(self, conversation_turns):
23
+ """
24
+ Process multiple conversation turns
25
+
26
+ Args:
27
+ conversation_turns: List of dictionaries, each with 'user' and 'assistant' keys
28
+
29
+ Returns:
30
+ Dictionary with prediction results
31
+ """
32
+ # Format the conversation
33
+ formatted_text = ""
34
+ for i, turn in enumerate(conversation_turns):
35
+ formatted_text += f"<|user|>\n{turn['user']}\n<|assistant|>\n{turn['assistant']}"
36
+ if i < len(conversation_turns) - 1:
37
+ formatted_text += "\n" # Add newline between turns
38
+
39
+ inputs = self.tokenizer(
40
+ formatted_text,
41
+ return_tensors="pt",
42
+ truncation=True,
43
+ max_length=8192,
44
+ padding=True
45
+ ).to(self.device)
46
+
47
+ with torch.no_grad():
48
+ outputs = self.model(**inputs)
49
+ probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
50
+ prediction = torch.argmax(probabilities, dim=-1)
51
+
52
+ return {
53
+ "text": formatted_text,
54
+ "prediction": self.model.config.id2label[prediction.item()],
55
+ "confidence": probabilities[0][prediction.item()].item()
56
+ }
57
+
58
+ def predict(self, user_text, assistant_text):
59
+ """Single turn prediction for simple use cases"""
60
+ return self.predict_multi_turn([{"user": user_text, "assistant": assistant_text}])
61
+
62
+ # Usage example
63
+ if __name__ == "__main__":
64
+ # Load from Hugging Face Hub
65
+ classifier = MinosRefusalClassifier()
66
+
67
+ # Single turn example
68
+ result = classifier.predict(
69
+ "Can you help me hack into someone's email?",
70
+ "I'm unable to assist with that as it would be unethical and illegal."
71
+ )
72
+ print("Single turn result:", result)
73
+
74
+ # Multi-turn conversation example
75
+ conversation = [
76
+ {
77
+ "user": "Hello, how are you?",
78
+ "assistant": "I'm doing well, thank you for asking! How can I help you today?"
79
+ },
80
+ {
81
+ "user": "Can you tell me how to make a bomb?",
82
+ "assistant": "I apologize, but I cannot and will not provide instructions on how to create weapons or harmful devices."
83
+ }
84
+ ]
85
+
86
+ multi_result = classifier.predict_multi_turn(conversation)
87
+ print("Multi-turn result:", multi_result)