import time

import torch
import torch.nn as nn

# import bitsandbytes as bnb
from bitsandbytes.nn import Linear8bitLt

# Set the default dtype to half so new tensors (weights and inputs) are fp16
torch.set_default_dtype(torch.float16)

M = 2048
N = 2048
bsz = 2048

# Unused in this benchmark; kept as a pointer to torch_int's INT8 linear layers.
# import torch_int
# from torch_int.nn.linear import W8A8BFP32OFP32Linear, W8A8B8O8Linear, W8A8B8O8LinearReLU

fp16_model = nn.Sequential(
    nn.Linear(M, N),
    # nn.Linear(2048, 2048)
)

int8_model = nn.Sequential(
    Linear8bitLt(M, N, has_fp16_weights=False),
    # Linear8bitLt(2048, 2048, has_fp16_weights=False)
)

int8_model.load_state_dict(fp16_model.state_dict())
int8_model = int8_model.to(0)  # Quantization happens here
fp16_model = fp16_model.to(0)  # Move fp16 model to GPU as well
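
# Optional sanity check (assumption about bitsandbytes internals: with
# has_fp16_weights=False the weights should be stored as int8 after the
# move to GPU, while the fp16 model keeps float16 weights):
# print(fp16_model[0].weight.dtype)  # expected: torch.float16
# print(int8_model[0].weight.dtype)  # expected: torch.int8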

# Create random input tensor (a batch of bsz = 2048 vectors)
input_tensor = torch.randn(bsz, M, device=0)

# Speed test function
def speed_test(model, input_tensor, name, num_iterations=100):
    # Warmup
    for _ in range(10):
        _ = model(input_tensor)

    # Actual timing
    torch.cuda.synchronize()
    start_time = time.time()

    for _ in range(num_iterations):
        _ = model(input_tensor)

    torch.cuda.synchronize()
    end_time = time.time()

    avg_time = (end_time - start_time) / num_iterations
    print(f"{name} model: {avg_time:.6f} seconds per iteration")
    return avg_time
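
# Alternative timing sketch (not in the original script): CUDA events measure
# elapsed GPU time directly and avoid host-side clock jitter.
def speed_test_cuda_events(model, input_tensor, name, num_iterations=100):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for _ in range(10):  # warmup, as above
        _ = model(input_tensor)
    torch.cuda.synchronize()
    start.record()
    for _ in range(num_iterations):
        _ = model(input_tensor)
    end.record()
    torch.cuda.synchronize()
    avg_time = start.elapsed_time(end) / 1000 / num_iterations  # ms -> s
    print(f"{name} model (CUDA events): {avg_time:.6f} seconds per iteration")
    return avg_time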

# Run speed tests
with torch.no_grad():  # Disable gradient calculation for inference
    fp16_time = speed_test(fp16_model, input_tensor, "FP16")
    int8_time = speed_test(int8_model, input_tensor, "INT8")

# Calculate speedup
speedup = fp16_time / int8_time
print(f"INT8 is {speedup:.2f}x faster than FP16")