int xpageoffset;
for (i = 0; i < 10000; i++) {
  for (xpageoffset = 0; xpageoffset < 10000; xpageoffset++) {
    d = 0.0;
    for (k = 0; k < 10000; k++) {
      d += a[i + 10000 * k] * b[k + 10000 * xpageoffset];
    }
    c[i + 10000 * xpageoffset] = d;
  }
}
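For comparison, a loop nest like the one above is what MATLAB Coder typically emits for a plain matrix product when no BLAS replacement is configured. A minimal sketch of such an entry-point function (the name myMatMul is hypothetical):

function c = myMatMul(a, b) %#codegen
% a, b: 10000-by-10000 double matrices; c = a*b lowers to the loops above
c = a * b;
end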
static double b[100000000];
static double c[100000000];
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, (MKL_INT)10000,
            (MKL_INT)10000, (MKL_INT)10000, 1.0, &a[0], (MKL_INT)10000,
            &b[0], (MKL_INT)10000, 0.0, &c[0], (MKL_INT)10000);
#pragma omp parallel for \
    num_threads(4 > omp_get_max_threads() ? omp_get_max_threads() : 4) \
    private(b_i, yCol, b_r)
for (i = 0; i < 10; i++) {
  for (b_i = 0; b_i < 256; b_i++) {
    b_r[b_i] = r[i + 10 * b_i];
  }
  c_FFTImplementationCallback_doH(b_r, 0, yCol);
  for (b_i = 0; b_i < 256; b_i++) {
    a[i + 10 * b_i] = yCol[b_i].re;
  }
}
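A hedged sketch of MATLAB source that can produce an OpenMP-parallelized FFT loop like the one above (the function name rowwiseFFT and the 10-by-256 shape are assumptions inferred from the indices in the generated code); whether the generated code may use OpenMP is controlled by the EnableOpenMP configuration property:

function a = rowwiseFFT(r) %#codegen
% r: 10-by-256 real matrix; 256-point FFT along each row, keeping the real part
a = real(fft(r, [], 2));
end

cfg = coder.config('exe');
cfg.EnableOpenMP = true; % allow OpenMP pragmas in the generated code
codegen rowwiseFFT -config cfg -args {zeros(10, 256)} -report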
cfg = coder.gpuConfig('exe');
cfg.GenerateExampleMain = 'GenerateCodeAndCompile';
codegen largeMatrixTest -config cfg -report
cublasDgemm(getCublasGlobalHandle(), CUBLAS_OP_N, CUBLAS_OP_N, 5000, 5000, 5000,
            (double *)gpu_alpha1, (double *)&(*gpu_a)[0], 5000,
            (double *)&(*gpu_b)[0], 5000, (double *)gpu_beta1,
            (double *)&(*gpu_c)[0], 5000);
cusolverDnDgesvd(getCuSolverGlobalHandle(), 'N', 'N', 5000, 5000,
                 (double *)&(*gpu_c)[0], 5000, &(*gpu_s)[0], NULL, 1, NULL, 1,
                 (double *)getCuSolverWorkspaceBuff(),
                 *getCuSolverWorkspaceReq(), &(*gpu_superb)[0], gpu_info_t);
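Whether GPU Coder maps these operations onto cuBLAS and cuSOLVER is controlled from the configuration object. A brief sketch using the documented coder.GpuCodeConfig properties:

cfg = coder.gpuConfig('exe');
cfg.GpuConfig.EnableCUBLAS = true;   % map matrix products onto cublasDgemm
cfg.GpuConfig.EnableCUSOLVER = true; % map svd/eig onto cuSOLVER routines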
Beyond the topics covered above, this article has not discussed today's hottest and most hardware-acceleration-hungry application: deep learning. In fact, since R2017b MATLAB has supported generating C/C++ code for deep learning inference and accelerating that inference in hardware, including NVIDIA desktop, server, and embedded GPUs (via CUDA), ARM Mali GPUs and Arm Neon cores (via the Arm Compute Library), and the SIMD instructions (SSE/AVX) of x86_64 processors (via Intel MKL-DNN). As of R2020b, Deep Learning HDL Toolbox can additionally generate hardware description language from a trained deep learning model, allowing deep learning to be deployed on FPGAs. For details, see the MATLAB documentation or contact the MathWorks China office.
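To give a flavor of that workflow, here is a minimal sketch of deep-learning code generation targeting Intel MKL-DNN (the entry-point name myNetPredict and the 224-by-224-by-3 input size are hypothetical):

cfg = coder.config('lib');
cfg.TargetLang = 'C++';
cfg.DeepLearningConfig = coder.DeepLearningConfig('mkldnn'); % Intel MKL-DNN backend
codegen myNetPredict -config cfg -args {ones(224, 224, 3, 'single')} -report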
function largeMatrixTest()
tic;
a = rand(5000, 5000);
b = a * a;
c = sum(a);
s = svd(a);
e = eig(a);
[maxValue, maxPos] = max(a);
tCpu = toc;
fprintf(' Time cost: %f\n', tCpu);
end
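One way to quantify the speedup is to also build a MEX target and time both versions from within MATLAB; a hedged sketch (the _mex suffix is codegen's default naming):

cfgMex = coder.config('mex');
codegen largeMatrixTest -config cfgMex -report
tic; largeMatrixTest();     % interpreted MATLAB baseline
tMatlab = toc;
tic; largeMatrixTest_mex(); % generated and compiled MEX
tMex = toc;
fprintf('MATLAB: %f s, MEX: %f s\n', tMatlab, tMex);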
classdef useMyBLAS < coder.BLASCallback
    methods (Static)
        function updateBuildInfo(buildInfo, ~)
            libPath = 'C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl\lib\intel64';
            libPriority = '';
            libPreCompiled = true;
            libLinkOnly = true;
            libs = {'mkl_intel_ilp64.lib', 'mkl_intel_thread.lib', 'mkl_core.lib'};
            buildInfo.addLinkObjects(libs, libPath, libPriority, libPreCompiled, ...
                libLinkOnly);
            buildInfo.addLinkObjects('libiomp5md.lib', fullfile(matlabroot, 'bin', ...
                'win64'), libPriority, libPreCompiled, libLinkOnly);
            buildInfo.addIncludePaths('C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2020.1.216\windows\mkl\include');
            buildInfo.addDefines('-DMKL_ILP64');
        end
        function headerName = getHeaderFilename()
            headerName = 'mkl_cblas.h';
        end
        function intTypeName = getBLASIntTypeName()
            intTypeName = 'MKL_INT';
        end
        function doubleComplexTypeName = getBLASDoubleComplexTypeName()
            doubleComplexTypeName = 'my_double_complex_type';
        end
        function singleComplexTypeName = getBLASSingleComplexTypeName()
            singleComplexTypeName = 'my_single_complex_type';
        end
        function p = useEnumNameRatherThanTypedef()
            p = true;
        end
    end
end
classdef useMyLAPACK < coder.LAPACKCallback
    methods (Static)
        function hn = getHeaderFilename()
            hn = 'mkl_lapacke.h';
        end
        function updateBuildInfo(buildInfo, buildctx)
            buildInfo.addIncludePaths(fullfile(pwd, 'include'));
            libName = 'mkl_lapack95_ilp64';
            libPath = 'C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl\lib\intel64';
            [~, linkLibExt] = buildctx.getStdLibInfo();
            buildInfo.addLinkObjects([libName linkLibExt], libPath, ...
                '', true, true);
            buildInfo.addIncludePaths('C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2020.1.216\windows\mkl\include');
            buildInfo.addDefines('HAVE_LAPACK_CONFIG_H');
            buildInfo.addDefines('LAPACK_COMPLEX_STRUCTURE');
            buildInfo.addDefines('LAPACK_ILP64');
        end
    end
end
cfg = coder.config('exe');
cfg.CustomBLASCallback = 'useMyBLAS';
cfg.CustomLAPACKCallback = 'useMyLAPACK';
cfg.GenerateExampleMain = 'GenerateCodeAndCompile';
codegen largeMatrixTest -config cfg -report
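If generation and compilation succeed, the resulting executable (named after the entry point by default) can be run directly to check the timing against the MATLAB baseline:

system('largeMatrixTest.exe'); % Windows assumed, matching the MKL library paths above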