LeNet-5

(Figure: LeNet-5 network architecture)
Note: because FPGA resources are limited, the 120-unit fully connected layer is replaced with 60 units, and the 84-unit fully connected layer with 16 units.
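This roughly halves the fully connected parameter count: 400×120 + 120×84 + 84×10 = 58,920 weights in the original layout versus 400×60 + 60×16 + 16×10 = 25,120 here. The three terms (24,000, 960, and 160) match the weight-buffer sizes declared in the HLS component below.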

PyTorch code for building and training the model

# Load in relevant libraries, and alias where appropriate
import time
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchsummary import summary
import matplotlib.pyplot as plt
# Define relevant variables for the ML task
batch_size = 64
num_classes = 10
learning_rate = 0.001
num_epochs = 20

# Device will determine whether to run the training on GPU or CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Loading the dataset and preprocessing
train_dataset = torchvision.datasets.MNIST(root='./data', train=True,
                                           transform=transforms.Compose([transforms.Resize((32, 32)),
                                                                         transforms.ToTensor(),
                                                                         # transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
                                                                         ]),
                                           download=True)

test_dataset = torchvision.datasets.MNIST(root='./data', train=False,
                                          transform=transforms.Compose([transforms.Resize((32, 32)),
                                                                        transforms.ToTensor(),
                                                                        # transforms.Normalize(mean=(0.1325,), std=(0.3105,)),
                                                                        ]),
                                          download=True)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
# Defining the convolutional neural network
class LeNet5(nn.Module):
    def __init__(self, num_classes):
        super(LeNet5, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),
            nn.AvgPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.AvgPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(400, 60)    # 120 in the original LeNet-5, reduced to 60
        self.fc1 = nn.Linear(60, 16)    # 84 in the original LeNet-5, reduced to 16
        self.fc2 = nn.Linear(16, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.sigmoid(out)
        out = self.fc1(out)
        out = self.sigmoid(out)
        out = self.fc2(out)
        return out

model = LeNet5(num_classes).to(device)

# Setting the loss function
cost = nn.CrossEntropyLoss()

# Setting the optimizer with the model parameters and learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# total_step is used to print how many steps remain during training
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = cost(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 400 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
# Test the model
# In the test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

# Network architecture
print(summary(LeNet5(num_classes).cuda(), input_size=(1, 32, 32)))
# Save the model
torch.save(model, "model.pth")

# Load the model
model_path = "model.pth"
model = torch.load(model_path, map_location=torch.device('cpu'))

# Dump every parameter tensor into a dict of numpy arrays
param = {}
for name, parameters in model.cpu().state_dict().items():
    param[name] = parameters.detach().numpy()
# Get one batch of test images
img_iter = iter(test_loader)
images, labels = next(img_iter)
images = images.numpy()
labels = labels.numpy()
# Show the images together with the model's predictions
i = 0
fig = plt.figure(figsize=(10, 10))
for label, img in zip(labels[:9], images[:9]):
    i += 1
    ax = fig.add_subplot(3, 3, i)
    ax.title.set_text("label: " + str(label))
    plt.imshow(img.reshape(img.shape[1], img.shape[2]))

for label, img in zip(labels[:9], model(torch.tensor(images[:9]))):
    print("The truth: ", label, "predicted label: ", img.argmax().tolist())
# Test the single-image inference time on the CPU
start = time.time()
pred_label = model(torch.tensor(images[:1])).argmax().tolist()
end = time.time() - start
print("The predicted label is: ", pred_label, " the inference time is: ", str(end * 1000) + "ms.")
# Export the test images as C headers
for label, img in zip(labels[:9], images[:9]):
    shape = img.shape
    img = img.astype(np.float32)
    pic_name = "input_" + str(label)
    with open(pic_name + ".h", "w") as f:
        new_str2 = str(img.tolist())
        new_str2 = new_str2.replace("[", "")
        new_str2 = new_str2.replace("]", "")
        f.write("float " + pic_name + "[" + str(shape[1] * shape[2]) + "]"
                + " = {" + new_str2 + "};\n\n")
# Export the model parameters as C headers
for key in param.keys():
    shape = param[key].shape
    new_str1 = str(param[key].tolist())
    new_str1 = new_str1.replace("[", "")
    new_str1 = new_str1.replace("]", "")
    # Flatten the tensor to a 1-D C array, whatever its rank
    size = 1
    for dim in shape:
        size *= dim
    name = key.replace(".", "_")
    with open(name + ".h", "w") as f:
        f.write("float " + name + "[" + str(size) + "]" + " = {" + new_str1 + "};\n\n")
    print(key + " saved successfully!")
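For reference, each generated header is a flat C array whose name mirrors the parameter name with dots replaced by underscores. A hypothetical excerpt of layer1_0_bias.h (the six values are illustrative placeholders, not real trained biases):

/* layer1_0_bias.h -- produced by the export loop above.
   Values below are placeholders, not from a real training run. */
float layer1_0_bias[6] = {0.0132, -0.0457, 0.0981, -0.0023, 0.0614, -0.0789};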

HLS code

#include "HLS/hls.h"
#include "stdio.h"float expf(float x) {x = 1.0 + x / 1024;x *= x; x *= x; x *= x; x *= x; x *= x; x *= x; x *= x; x *= x; x *= x; x *= x; return x;
}
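As a quick sanity check, the approximation can be compared against the math-library expf on the host. This standalone sketch is not part of the original post; the function is renamed fast_expf here so it does not shadow libm's expf:

#include <stdio.h>
#include <math.h>

// Same approximation as above, renamed to avoid colliding with libm's expf.
static float fast_expf(float x) {
    x = 1.0f + x / 1024.0f;
    for (int i = 0; i < 10; i++) x *= x;  // raise to the 1024th power
    return x;
}

int main(void) {
    // Compare against the reference expf over the range sigmoid actually sees.
    for (float x = -8.0f; x <= 8.0f; x += 2.0f)
        printf("x=%5.1f  approx=%12.6f  ref=%12.6f\n", x, fast_expf(x), expf(x));
    return 0;
}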
// Average pooling: take four input values, sum them, and return the mean.
float AvgPool_2x2(float input[4]) {
    float res = 0;
    int i;
    for (i = 0; i < 4; i++) {
        res += input[i];
    }
    res /= 4;
    return res;
}

float sigmoid(float x) {
    return (1 / (1 + expf(-x)));
}

// Basic convolution unit: dot product of a 5x5 image patch and a 5x5 kernel.
float Conv_5x5(float input[25], float kernel[25]) {
    int x, y;
    float result = 0;
    for (y = 0; y < 5; y++) {
        for (x = 0; x < 5; x++) {
            result += input[x + y * 5] * kernel[x + y * 5];
            // result += input[x + y * 5] * kernel[x * 5 + y];
        }
    }
    return result;
}

// C1 kernels: 5x5 x 6 = 25 x 6 = 150 weights
void ConvLayer_1(float input[1024], float *C1_value, float *weights, float *bias) {
    int i_y, i_x, matrix_y, matrix_x;  // loop variables and intermediates
    int k_num, mat_i = 0;              // counter over the different kernels
    // Outer loop (named top_loop in the original, to attach optimization
    // directives): k_num selects the kernel. Layer C1 has 6 distinct kernels,
    // so 6 passes produce 6 feature maps.
    for (k_num = 0; k_num < 6; k_num += 1) {
        // matrix_2 holds the weights of the current kernel; a 5x5 kernel has
        // 25 values, so 25 iterations fill it.
        float matrix_2[25];
        for (mat_i = 0; mat_i < 25; mat_i++) {
            matrix_2[mat_i] = weights[mat_i + k_num * 25];
        }
        // One full pass of a kernel produces a 28x28 output feature map.
        for (i_y = 0; i_y < 28; i_y++) {
            for (i_x = 0; i_x < 28; i_x++) {
                float matrix[25];
                int pic_value_index = i_x + i_y * 32;
                // 25 iterations gather the input patch for one dot product.
                for (matrix_y = 0; matrix_y < 5; matrix_y++) {
                    for (matrix_x = 0; matrix_x < 5; matrix_x++) {
                        // Patch index runs 0..24 (25 values).
                        int matrix_index = matrix_x + matrix_y * 5;
                        // Pixel index runs 0..1023 in the 32x32 input image.
                        int input_value_index = pic_value_index + matrix_x + matrix_y * 32;
                        matrix[matrix_index] = input[input_value_index];
                    }
                }
                // out_pic_index addresses the output; C1 produces
                // 28x28x6 = 4704 values in total.
                int out_pic_index = i_x + i_y * 28 + k_num * 784;
                // One dot product of the image patch and the kernel.
                C1_value[out_pic_index] = Conv_5x5(matrix, matrix_2) + bias[k_num];
            }
        }
    }
}

void AvgpoolLayer_2(float input[4704], float *A2_value) {
    int k_num, i_y, i_x, matrix_x, matrix_y;
    int count = 0;
    // 6 feature maps, so 6 passes.
    for (k_num = 0; k_num < 6; k_num++) {
        // Each feature map is 28x28; step over it in 2x2 tiles.
        for (i_y = 0; i_y < 27; i_y += 2) {
            for (i_x = 0; i_x < 27; i_x += 2) {
                float matrix[4];
                // Index of the current pixel inside the 28x28x6 input.
                int index_now = i_x + i_y * 28 + k_num * 784;
                // Average-pool a 2x2 region: map one output index to four
                // input indices and copy the four values into matrix.
                for (matrix_y = 0; matrix_y < 2; matrix_y++) {
                    for (matrix_x = 0; matrix_x < 2; matrix_x++) {
                        int input_index = index_now + matrix_x + matrix_y * 28;
                        matrix[matrix_x + matrix_y * 2] = input[input_index];
                    }
                }
                // The four values are reduced to one output.
                // A2_value[count] = sigmoid(AvgPool_2x2(matrix));
                A2_value[count] = AvgPool_2x2(matrix);
                count++;  // advance the output index across all 14x14 outputs
            }
        }
    }
}

// C3 kernels: 5x5 x 6 x 16 = 25 x 6 x 16 = 2400 weights
void ConvLayer_3(float input[1176], float *C3_value, float *weights, float *bias) {
    int k_num, nk_num, i_y, i_x, matrix_x, matrix_y;
    int mat_i;
    int i = 0;
    // 16 output feature maps, each 10x10; every output sums the convolutions
    // over all 6 input feature maps.
    for (nk_num = 0; nk_num < 16; nk_num++) {
        for (i_y = 0; i_y < 10; i_y++) {
            for (i_x = 0; i_x < 10; i_x++) {
                float res_total_6 = 0;
                float matrix[25];
                int index_now = i_x + i_y * 10 + nk_num * 100;
                for (k_num = 0; k_num < 6; k_num++) {
                    float matrix_2[25];
                    int input_index_now = k_num * 14 * 14 + i_x + i_y * 14;
                    for (mat_i = 0; mat_i < 25; mat_i++) {
                        // int weights_index = mat_i + k_num*25 + (nk_num+1)*150;
                        int weights_index = mat_i + k_num * 25 + nk_num * 150;
                        matrix_2[mat_i] = weights[weights_index];
                    }
                    for (matrix_y = 0; matrix_y < 5; matrix_y++) {
                        for (matrix_x = 0; matrix_x < 5; matrix_x++) {
                            int matrix_index = matrix_x + matrix_y * 5;
                            int input_value_index = input_index_now + matrix_x + matrix_y * 14;
                            matrix[matrix_index] = input[input_value_index];
                        }
                    }
                    res_total_6 += Conv_5x5(matrix, matrix_2);
                    i++;
                }
                C3_value[index_now] = res_total_6 + bias[nk_num];
            }
        }
    }
}

// Input: 10x10x16 = 1600
void AvgpoolLayer_4(float input[1600], float *A4_value) {
    int k_num, i_y, i_x, matrix_x, matrix_y;
    int count = 0;
    for (k_num = 0; k_num < 16; k_num++) {
        for (i_y = 0; i_y < 10; i_y += 2) {
            for (i_x = 0; i_x < 10; i_x += 2) {
                float matrix[4];
                int index_now = i_x + i_y * 10 + k_num * 100;
                for (matrix_y = 0; matrix_y < 2; matrix_y++) {
                    for (matrix_x = 0; matrix_x < 2; matrix_x++) {
                        int input_index = index_now + matrix_x + matrix_y * 10;
                        matrix[matrix_x + matrix_y * 2] = input[input_index];
                    }
                }
                // A4_value[count] = sigmoid(AvgPool_2x2(matrix));
                A4_value[count] = AvgPool_2x2(matrix);
                count++;
            }
        }
    }
}

// F5 weights: 400 x 60 = 24000 (400 x 120 = 48000 in the original LeNet-5)
void FullyConnLayer_5(float input[400], float *F5_value, float *weights, float *bias) {
    int i_y, i_x;
    for (i_y = 0; i_y < 60; i_y++) {
        float res = 0;
        for (i_x = 0; i_x < 400; i_x++) {
            int index = i_x + i_y * 400;
            res += input[i_x] * weights[index];
        }
        F5_value[i_y] = sigmoid(res + bias[i_y]);
    }
}

// F6 weights: 60 x 16 = 960 (120 x 84 = 10080 in the original LeNet-5)
void FullyConnLayer_6(float input[60], float *F6_value, float *weights, float *bias) {
    int i_y, i_x;
    for (i_y = 0; i_y < 16; i_y++) {
        float res = 0;
        for (i_x = 0; i_x < 60; i_x++) {
            int index = i_x + i_y * 60;
            res += input[i_x] * weights[index];
        }
        F6_value[i_y] = sigmoid(res + bias[i_y]);
    }
}

// F7 weights: 16 x 10 = 160
void FullyConnLayer_7(float input[16], float *F7_value, float *weights, float *bias) {
    int i_y, i_x;
    for (i_y = 0; i_y < 10; i_y++) {
        float res = 0;
        for (i_x = 0; i_x < 16; i_x++) {
            int index = i_x + i_y * 16;
            res += input[i_x] * weights[index];
        }
        F7_value[i_y] = res + bias[i_y];  // raw logits; the argmax below needs no sigmoid
    }
}

hls_avalon_slave_component
component int LeNet5(
        hls_avalon_slave_memory_argument(1024  * sizeof(float)) float *img,
        hls_avalon_slave_memory_argument(150   * sizeof(float)) float *c1_weight,
        hls_avalon_slave_memory_argument(6     * sizeof(float)) float *c1_bias,
        hls_avalon_slave_memory_argument(2400  * sizeof(float)) float *c3_weight,
        hls_avalon_slave_memory_argument(16    * sizeof(float)) float *c3_bias,
        hls_avalon_slave_memory_argument(24000 * sizeof(float)) float *c5_weight,
        hls_avalon_slave_memory_argument(60    * sizeof(float)) float *c5_bias,
        hls_avalon_slave_memory_argument(960   * sizeof(float)) float *f6_weight,
        hls_avalon_slave_memory_argument(16    * sizeof(float)) float *f6_bias,
        hls_avalon_slave_memory_argument(160   * sizeof(float)) float *f7_weight,
        hls_avalon_slave_memory_argument(10    * sizeof(float)) float *f7_bias) {
    // The output of each layer
    float C1_value[4704];  // 28x28x6
    float A2_value[1176];  // 14x14x6
    float C3_value[1600];  // 10x10x16
    float A4_value[400];   // 5x5x16
    float F5_value[60];
    float F6_value[16];
    float F7_value[10];
    int i, ret = 0;
    float temp;

    // Run the layers in sequence
    ConvLayer_1(img, C1_value, c1_weight, c1_bias);
    AvgpoolLayer_2(C1_value, A2_value);
    ConvLayer_3(A2_value, C3_value, c3_weight, c3_bias);
    AvgpoolLayer_4(C3_value, A4_value);
    FullyConnLayer_5(A4_value, F5_value, c5_weight, c5_bias);
    FullyConnLayer_6(F5_value, F6_value, f6_weight, f6_bias);
    FullyConnLayer_7(F6_value, F7_value, f7_weight, f7_bias);
    // printf("%f,%f\n", F7_value[0], F7_value[1]);

    // Argmax over the 10 logits gives the predicted digit.
    temp = F7_value[0];
    for (i = 1; i < 10; i++) {
        if (F7_value[i] > temp) {
            temp = F7_value[i];
            ret = i;
        }
    }
    return ret;
}
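For x86 emulation, Intel HLS lets a component be called like an ordinary C function from a testbench. The sketch below is an assumption, not part of the original post; it presumes the headers exported by the Python script are on the include path and that it is compiled together with the component source:

// tb_lenet5.c -- hypothetical emulation testbench, not from the original post.
#include <stdio.h>
#include "layer1_0_weight.h"   // c1_weight[150]
#include "layer1_0_bias.h"     // c1_bias[6]
#include "layer2_0_weight.h"   // c3_weight[2400]
#include "layer2_0_bias.h"     // c3_bias[16]
#include "fc_weight.h"         // c5_weight[24000]
#include "fc_bias.h"           // c5_bias[60]
#include "fc1_weight.h"        // f6_weight[960]
#include "fc1_bias.h"          // f6_bias[16]
#include "fc2_weight.h"        // f7_weight[160]
#include "fc2_bias.h"          // f7_bias[10]
#include "input_7.h"           // input_7[1024]: a 32x32 image labeled 7

int main(void) {
    // In emulation the component runs as plain C; on hardware the same
    // call becomes transactions on the Avalon slave interfaces.
    int pred = LeNet5(input_7,
                      layer1_0_weight, layer1_0_bias,
                      layer2_0_weight, layer2_0_bias,
                      fc_weight, fc_bias,
                      fc1_weight, fc1_bias,
                      fc2_weight, fc2_bias);
    printf("expected 7, predicted %d\n", pred);
    return 0;
}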

HPS-side code

/*
 * main.c
 *
 *  Created on: 2022-07-24
 *      Author: 86130
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>
#include <sys/mman.h>
#define soc_cv_av
#include "hwlib.h"
#include "socal/socal.h"
#include "socal/hps.h"
#include "hps_0.h"
#include "layer1_0_weight.h"
#include "layer1_0_bias.h"
#include "layer2_0_weight.h"
#include "layer2_0_bias.h"
#include "fc_weight.h"
#include "fc_bias.h"
#include "fc1_weight.h"
#include "fc2_weight.h"
#include "fc1_bias.h"
#include "fc2_bias.h"
#include "input_0.h"
#include "input_1.h"
#include "input_2.h"
#include "input_3.h"
#include "input_4.h"
#include "input_5.h"
#include "input_6.h"
#include "input_7.h"
#include "input_8.h"
#include "input_9.h"
#define HW_REGS_BASE (ALT_STM_OFST)     // base address of the HPS peripheral region
#define HW_REGS_SPAN (0x04000000)       // size of the HPS peripheral region, 64 MB
#define HW_REGS_MASK (HW_REGS_SPAN - 1) // address mask for the HPS peripheral region
// Port definitions (as a struct)
typedef struct {
    volatile float *img;
    volatile float *c1_w;
    volatile float *c1_b;
    volatile float *c3_w;
    volatile float *c3_b;
    volatile float *c5_w;
    volatile float *c5_b;
    volatile float *f6_w;
    volatile float *f6_b;
    volatile float *f7_w;
    volatile float *f7_b;
} fc_port_def;
fc_port_def fc_port;
// Control/status registers of the component's Avalon slave interface
typedef struct {
    volatile long long busy;
    volatile long long start;
    volatile long long ire_en;
    volatile long long done;
    volatile long long result;
} fc_ctrl_def;
fc_ctrl_def *fc_ctrl;

int fc_init(void *virtual_base) {
    void *fc_ctrl_addr;
    fc_ctrl_addr = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_CRA_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_ctrl = (fc_ctrl_def *)fc_ctrl_addr;  // map the control interface
    fc_ctrl->start = 0;
    fc_port.img  = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_IMG_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c1_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_C1_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c1_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_C1_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c3_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_C3_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c3_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_C3_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c5_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_C5_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.c5_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_C5_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.f6_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_F6_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.f6_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_F6_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.f7_w = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_F7_WEIGHT_BASE) & (unsigned long)(HW_REGS_MASK));
    fc_port.f7_b = virtual_base + ((unsigned long)(ALT_LWFPGASLVS_OFST +
            LENET5_0_LENET5_INTERNAL_INST_AVS_F7_BIAS_BASE) & (unsigned long)(HW_REGS_MASK));

    // Load the weights and biases into the component's on-chip memories
    memcpy((void *)fc_port.c1_w, layer1_0_weight, 150 * sizeof(float));
    memcpy((void *)fc_port.c1_b, layer1_0_bias, 6 * sizeof(float));
    memcpy((void *)fc_port.c3_w, layer2_0_weight, 2400 * sizeof(float));
    memcpy((void *)fc_port.c3_b, layer2_0_bias, 16 * sizeof(float));
    memcpy((void *)fc_port.c5_w, fc_weight, 24000 * sizeof(float));
    memcpy((void *)fc_port.c5_b, fc_bias, 60 * sizeof(float));
    memcpy((void *)fc_port.f6_w, fc1_weight, 960 * sizeof(float));
    memcpy((void *)fc_port.f6_b, fc1_bias, 16 * sizeof(float));
    memcpy((void *)fc_port.f7_w, fc2_weight, 160 * sizeof(float));
    memcpy((void *)fc_port.f7_b, fc2_bias, 10 * sizeof(float));
    return 0;
}
const float *imgx[10] = {input_0, input_1, input_2, input_3, input_4,
                         input_5, input_6, input_7, input_8, input_9};
int main() {
    int fd, i;
    void *virtual_base;
    float time_s, time_ns, time_ms;
    struct timespec ts1, ts2;

    // 1. Open the memory device: open()
    fd = open("/dev/mem", (O_RDWR | O_SYNC));
    if (fd == (-1)) {
        printf("ERROR: could not open \"/dev/mem\"...\n");
        return 1;
    }

    // 2. Map the peripheral address space into user space: mmap()
    virtual_base = mmap(NULL, HW_REGS_SPAN, (PROT_READ | PROT_WRITE), MAP_SHARED, fd, HW_REGS_BASE);

    // 3. Initialize (usually a function of your own)
    fc_init(virtual_base);

    // 4. Drive the peripheral
    while (1) {
        for (i = 0; i < 10; i++) {
            memcpy((void *)fc_port.img, imgx[i], 1024 * sizeof(float));
            clock_gettime(CLOCK_MONOTONIC, &ts1);  // record the start time
            fc_ctrl->start = 1;                    // start the inference
            // Block (busy-wait) while done is not 2, i.e. the inference
            // has not finished yet.
            while ((fc_ctrl->done & 0x02) == 0);
            printf("%lld", fc_ctrl->done);
            fc_ctrl->start = 0;                    // inference done, clear the enable
            clock_gettime(CLOCK_MONOTONIC, &ts2);  // record the end time

            // The total time is time_s + time_ns; for convenient display,
            // convert everything to milliseconds.
            time_s = ts2.tv_sec - ts1.tv_sec;
            time_ns = ts2.tv_nsec - ts1.tv_nsec;
            time_ms = time_s * 1000 + time_ns / 1000000;
            printf("predict time:%.6f ms\n", time_ms);
            printf("input:%d, predict result:%lld\n", i, fc_ctrl->result);
        }
        break;
    }

    // 5. Unmap: munmap()
    if (munmap(virtual_base, HW_REGS_SPAN) != 0) {
        printf("ERROR: munmap() failed...\n");
        close(fd);
        return 1;
    }

    // 6. Close the file descriptor: close()
    close(fd);
    return 0;
}

