Description
Just want to say thanks for this! I've been trying to use other people's code that all uses the `transformer_lens` library, which has a bug that stops you loading models in 4bit, seems to have loads of problems with mixed 'cpu' and 'cuda' tensors, and is generally really slow for some reason.
I've modified your code to:

- Load the model in 4bit to get the `refusal_dir`.
- Then reload in `bfloat16` on the 'cpu' device.
- Upcast the operations on `o_proj` and `down_proj` to `float32` for the 'cpu' device (my old Xeon doesn't have native `bfloat16`).
- Downcast the modified tensors back to `bfloat16`.
- Save back as `safetensors` with the original tokenizer.
Only tested on `Mistral-7B-Instruct-v0.2` and `miqu-1-70b-sf`, which both use `llama` tensor names, but can confirm it is working:

```python
import torch
import gc
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
from tqdm import tqdm
#MODEL_ID = "Mistral-7B-Instruct-v0.2"
MODEL_ID = "miqu-1-70b-sf"
# More samples can help find the direction better.
NUM_PROMPT_SAMPLES = 32
# Used to skip the first and last layers for the modifications.
SKIP_BEGIN_LAYERS = 1 # Don't mess with the first layer.
SKIP_END_LAYERS = 0
# The layer we will use for the refusal_dir calculation will be floor(LAYER_FRACTION_TO_USE * model.layers).
LAYER_FRACTION_TO_USE = 0.6
# Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
SCALE_FACTOR = 1.0
torch.set_default_device("cpu")
torch.set_grad_enabled(False)
# Load the model on the GPU in quantized type if we can.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
    low_cpu_mem_usage=True,
    device_map='auto'
)
model.requires_grad_(False)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
layer_idx = int(len(model.model.layers) * LAYER_FRACTION_TO_USE)
print("Layer index for refusal direction: " + str(layer_idx))
with open("harmful.txt", "r") as f:
harmful = f.readlines()
with open("harmless.txt", "r") as f:
harmless = f.readlines()
harmful_instructions = random.sample(harmful, min(NUM_PROMPT_SAMPLES, len(harmful)))
harmless_instructions = random.sample(harmless, min(NUM_PROMPT_SAMPLES, len(harmless)))
harmful_toks = [
    tokenizer.apply_chat_template(conversation=[{"role": "user", "content": insn}], add_generation_prompt=True,
                                  return_tensors="pt") for insn in harmful_instructions]
harmless_toks = [
    tokenizer.apply_chat_template(conversation=[{"role": "user", "content": insn}], add_generation_prompt=True,
                                  return_tensors="pt") for insn in harmless_instructions]
bar_generate = tqdm(total = len(harmful_instructions) + len(harmless_instructions), desc = "Generating samples")
# Only return the final hidden state of the layer we care about, and use 'cpu' to save VRAM.
def generate(toks):
    output = model.generate(
        toks.to(model.device),
        use_cache=False,
        max_new_tokens=1,
        return_dict_in_generate=True,
        output_hidden_states=True,
        pad_token_id=tokenizer.eos_token_id
    )
    bar_generate.update(n=1)
    return output.hidden_states[0][layer_idx][:, -1, :].to('cpu') # Final hidden state = -1.
harmful_hidden = [generate(toks) for toks in harmful_toks]
harmless_hidden = [generate(toks) for toks in harmless_toks]
bar_generate.close()
harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
harmless_mean = torch.stack(harmless_hidden).mean(dim=0)
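# The refusal direction is just the difference of the two mean hidden states, normalised to unit length.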
refusal_dir = harmful_mean - harmless_mean
refusal_dir = refusal_dir.squeeze() / refusal_dir.norm()
torch.save(refusal_dir, MODEL_ID.replace("/", "_") + "_refusal_dir.pt")
# Free memory
del model
gc.collect()
torch.cuda.empty_cache()
# Reload the model in CPU memory with bfloat16 data type
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map='cpu'
)
model.requires_grad_(False)
# Make sure it's on the 'cpu' device.
if refusal_dir.device != model.device:
    refusal_dir = refusal_dir.to(model.device)
# Get the language model component and check it's as expected.
lm_model = model.model
assert hasattr(lm_model, 'layers'), "The model does not have the expected structure."
# Check the ranges are valid.
num_layers = len(lm_model.layers)
assert SKIP_BEGIN_LAYERS >= 0, "SKIP_BEGIN_LAYERS must be >= 0."
assert SKIP_END_LAYERS >= 0, "SKIP_END_LAYERS must be >= 0."
assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SKIP_END_LAYERS must be < num_layers."
bar_layers = tqdm(total= (num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc = "Modifying tensors")
# Cast any ops performed on the CPU up to float32... If you have a newer CPU, you might be able to use bfloat16 for this.
# NOTE: Use a negative scale_factor to "induce" and a positive scale_factor of < 1 to "ablate" less.
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
    assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
    tensor_float32 = tensor_data.to(torch.float32)
    refusal_dir_float32 = refusal_dir.to(torch.float32)
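    # W -= scale_factor * (r r^T) W: subtract the rank-1 projection onto refusal_dir,
    # i.e. remove (or dampen) whatever this weight matrix writes along the refusal direction.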
    tensor_float32 -= scale_factor * torch.matmul(torch.outer(refusal_dir_float32, refusal_dir_float32), tensor_float32)
    tensor_modified = tensor_float32.to(torch.bfloat16)
    bar_layers.update(1)
    return torch.nn.Parameter(tensor_modified)
# Modify the 'self_attn.o_proj.weight' and 'mlp.down_proj.weight' in each chosen layer.
# NOTE: These tensor names are specific to "llama" and may need changing.
# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
    lm_model.layers[layer_idx].self_attn.o_proj.weight = modify_tensor(
        lm_model.layers[layer_idx].self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
    )
    lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
        lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
    )
bar_layers.close()
# Save the modified model and original tokenizer
print("Saving modified model (with original tokenizer)...")
#model.save_pretrained("Mistral-7B-Instruct-v0.2-fixed")
#tokenizer.save_pretrained("Mistral-7B-Instruct-v0.2-fixed")
model.save_pretrained("miqu-1-70b-sf-fixed")
tokenizer.save_pretrained("miqu-1-70b-sf-fixed")
```

Beware that I have squeezed `refusal_dir` back to a vector from a `(1, d_hidden)` tensor, so you might need to change your `inference.py` code to match:

```python
refusal_dir = refusal_dir.squeeze() / refusal_dir.norm()
torch.save(refusal_dir, MODEL_ID.replace("/", "_") + "_refusal_dir.pt")
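# With a 1-D unit vector, the ablation at inference time can be written as a simple
# dot product, e.g. (hypothetical snippet - `hidden` is whatever activation you hook):
#   hidden -= (hidden @ refusal_dir).unsqueeze(-1) * refusal_dir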
```

If you want I can tidy the code up and do a proper pull request, or otherwise feel free to copy in whatever bits you might find useful - my runtime for `miqu` has gone from several hours (and around 500-600GB of RAM needed!) to a few minutes thanks to the 4bit stuff working, so huge thanks again!
I'm actually trying to use this method to remove some of the "positivity" from creative-writing models rather than remove refusals, so I will likely be making a lot more modifications now I have the bare-bones code working... I suspect my reason for failure so far is the use of `max_new_tokens=1` - this probably works well for refusals, as the first word being `"sorry"` or `"sure"` is quite telling (the `Mopey-Mule` model also seems to always start its reply with `"*sigh*"`), but for creative writing the first word has little bearing on whether the text is going to be "positive" or "dark", etc.
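One thing I might try (just an untested sketch - the `generate_pooled` name and the mean-pooling idea are my own guess, not something the code above does) is to generate a handful of tokens and average the chosen layer's hidden state over the generated steps, so the direction isn't dominated by the first word:

```python
# Untested sketch: pool the chosen layer's hidden state over several generated tokens.
def generate_pooled(toks, new_tokens=8):
    output = model.generate(
        toks.to(model.device),
        use_cache=True,
        max_new_tokens=new_tokens,
        return_dict_in_generate=True,
        output_hidden_states=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # output.hidden_states has one tuple per generated token; each tuple holds the
    # per-layer hidden states for that step, so pick our layer and the last position.
    states = [step[layer_idx][:, -1, :] for step in output.hidden_states]
    return torch.stack(states).mean(dim=0).to('cpu')
```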
EDIT: I just changed the code a bit more so `generate()` only returns the hidden state of the layer we care about (and on the 'cpu' device), as otherwise it was saving all layers for all the samples and ended up causing a CUDA OOM error after using up all available VRAM when you increase `NUM_PROMPT_SAMPLES` to much higher values.