function [] = flopsMx20( Size, Dname, Fname, TitleStr, State )
% flopsMx20  GFlops count by use of matrix multiplication
%
%   flopsMx20( Size, Dname, Fname, TitleStr, State )
%
%   GFlops benchmark based on the chained product of 20 single-precision
%   N-by-N matrices, timed once on the CPU and once on the GPU for every
%   N in Size.
%
%   Size     - vector of matrix dimensions N to benchmark (row or column).
%   Dname    - directory in which the result files are read/written.
%   Fname    - file name stem; 'flopsMx20_' is prepended and '.mat',
%              '.jpg' and '.eps' are appended.
%   TitleStr - text appended to 'Mx20: ' in the plot title.
%   State    - if equal to 'RESUME', previously saved results are loaded
%              from [Dname '/flopsMx20_' Fname '.mat'] and the benchmark
%              continues after the last stored point; any other value
%              starts from scratch.
%
%   Results are saved and plotted after every 10th data point and after
%   the final data point. The function returns nothing.
%
%   NOTE(review): gsingle, geval, gsync and gpu_entry are not defined in
%   this file - presumably they come from the Jacket GPU toolbox; confirm.
%
% (C) Torben Larsen, Aalborg University, 24-JUL-2010
% E-mail: tl.jacket@es.aau.dk
% http://wiki.accelereyes.com/wiki/index.php/Torben%27s_Corner

% Minimum execution time for the individual benchmark point
Tmin = 1;
% Max. number of repetitions in loop timing estimation
MaxAvg = 1E9;

%% INITIALIZE VECTORS
Mem_MB = zeros(length(Size),1);
Fname = ['flopsMx20_' Fname];
% Common path stem for the .mat, .jpg and .eps files
BasePath = [ Dname '/' Fname ];

%% PREALLOCATE ARRAYS ETC. - IF "RESUME" IS USED THEN LOAD DATA
% If State==RESUME, data is loaded from the existing file for the given
% benchmark and continued from where it came to.
if strcmp(State,'RESUME')
    % Load into a struct and copy the expected variables explicitly so
    % that the file content cannot overwrite unrelated local variables.
    Saved = load([ BasePath '.mat' ]);
    Size       = Saved.Size;        % the stored size vector takes precedence
    T_CPU      = Saved.T_CPU;
    T_CPU_tot  = Saved.T_CPU_tot;
    GFlops_cpu = Saved.GFlops_cpu;
    T_GPU      = Saved.T_GPU;
    T_GPU_tot  = Saved.T_GPU_tot;
    GFlops_gpu = Saved.GFlops_gpu;
    % Number of data points already completed
    ii = nnz(GFlops_cpu>0);
else
    GFlops_cpu = zeros(length(Size),1);
    GFlops_gpu = zeros(length(Size),1);
    T_CPU = zeros(length(Size),1);
    T_CPU_tot = zeros(length(Size),1);
    T_GPU = zeros(length(Size),1);
    T_GPU_tot = zeros(length(Size),1);
    ii = 0;
end
% A FOR loop iterates over columns, so force a row vector; otherwise a
% column-vector Size would be processed as one single (vector) value.
SizeN = reshape(Size(ii+1:end), 1, []);
NoPoints = numel(Size);

%% PERFORM ANALYSIS
for N=SizeN
    ii = ii + 1;

    % Define matrices
    Ac01 = randn(N,N,'single');
    Ac02 = randn(N,N,'single');
    Ac03 = randn(N,N,'single');
    Ac04 = randn(N,N,'single');
    Ac05 = randn(N,N,'single');
    Ac06 = randn(N,N,'single');
    Ac07 = randn(N,N,'single');
    Ac08 = randn(N,N,'single');
    Ac09 = randn(N,N,'single');
    Ac10 = randn(N,N,'single');
    Ac11 = randn(N,N,'single');
    Ac12 = randn(N,N,'single');
    Ac13 = randn(N,N,'single');
    Ac14 = randn(N,N,'single');
    Ac15 = randn(N,N,'single');
    Ac16 = randn(N,N,'single');
    Ac17 = randn(N,N,'single');
    Ac18 = randn(N,N,'single');
    Ac19 = randn(N,N,'single');
    Ac20 = randn(N,N,'single');

    % Print matrix size
    fprintf('%4.0f / %4.0f', N, max(Size));

    % CPU test begin --------------------------------------------------
    whilecount = 0;
    Telap_cpu = -1;   % -1 marks "no timing estimate yet"
    while Telap_cpu < Tmin
        whilecount = whilecount + 1;
        if Telap_cpu == -1
            % First pass: time two evaluations to estimate how many
            % repetitions are needed to reach about 1.5*Tmin.
            t1 = tic;
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05 * Ac06 * Ac07 ...
                * Ac08 * Ac09 * Ac10 * Ac11 * Ac12 * Ac13 * Ac14 ...
                * Ac15 * Ac16 * Ac17 * Ac18 * Ac19 * Ac20; % HERE %%%
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05 * Ac06 * Ac07 ...
                * Ac08 * Ac09 * Ac10 * Ac11 * Ac12 * Ac13 * Ac14 ...
                * Ac15 * Ac16 * Ac17 * Ac18 * Ac19 * Ac20; % HERE %%%
            Telap_cpu = toc(t1)/2;
            NoRunsCPU = ceil(1.5*Tmin/Telap_cpu);
        else
            % Previous attempt was shorter than Tmin: scale the run
            % count up, more aggressively for every retry.
            NoRunsCPU = ceil(1.5*whilecount*NoRunsCPU/Telap_cpu*Tmin);
        end

        % Warm-up
        for no=1:NoRunsCPU
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05 * Ac06 * Ac07 ...
                * Ac08 * Ac09 * Ac10 * Ac11 * Ac12 * Ac13 * Ac14 ...
                * Ac15 * Ac16 * Ac17 * Ac18 * Ac19 * Ac20; % HERE %%%
        end

        % Benchmark
        tstart1 = tic;
        for no=1:NoRunsCPU
            Rc = Ac01 * Ac02 * Ac03 * Ac04 * Ac05 * Ac06 * Ac07 ...
                * Ac08 * Ac09 * Ac10 * Ac11 * Ac12 * Ac13 * Ac14 ...
                * Ac15 * Ac16 * Ac17 * Ac18 * Ac19 * Ac20; % HERE %%%
        end
        Telap_cpu = toc(tstart1);
    end

    % Determine time for CPU loop alone
    RPT = min(5E3,ceil(MaxAvg/NoRunsCPU));
    tstart = tic;
    for AvgNo=1:RPT
        for no=1:NoRunsCPU
        end
    end
    T_CPU_Loop = toc(tstart)/RPT;

    % Compute CPU times (per-run time is floored at 2.5E-10 s so that it
    % stays positive after the loop overhead has been subtracted)
    T_CPU(ii) = max((Telap_cpu-T_CPU_Loop)/NoRunsCPU,2.5E-10);
    T_CPU_tot(ii) = Telap_cpu;
    fprintf(' | T_CPU: %6.1f,', T_CPU_tot(ii));
    % 19 matrix products, each counted as N^2*(2*N-1) flops
    GFlops_cpu(ii) = (19*N^2*(2*N-1))/(T_CPU(ii)*1E9);
    fprintf(' %7.1f [GFlops]', GFlops_cpu(ii));
    % CPU test end --------------------------------------------------

    % GPU test begin --------------------------------------------------
    Ag01 = gsingle(Ac01);
    Ag02 = gsingle(Ac02);
    Ag03 = gsingle(Ac03);
    Ag04 = gsingle(Ac04);
    Ag05 = gsingle(Ac05);
    Ag06 = gsingle(Ac06);
    Ag07 = gsingle(Ac07);
    Ag08 = gsingle(Ac08);
    Ag09 = gsingle(Ac09);
    Ag10 = gsingle(Ac10);
    Ag11 = gsingle(Ac11);
    Ag12 = gsingle(Ac12);
    Ag13 = gsingle(Ac13);
    Ag14 = gsingle(Ac14);
    Ag15 = gsingle(Ac15);
    Ag16 = gsingle(Ac16);
    Ag17 = gsingle(Ac17);
    Ag18 = gsingle(Ac18);
    Ag19 = gsingle(Ac19);
    Ag20 = gsingle(Ac20);
    geval(Ag01, Ag02, Ag03, Ag04, Ag05, Ag06, Ag07, Ag08, Ag09, Ag10, ...
        Ag11, Ag12, Ag13, Ag14, Ag15, Ag16, Ag17, Ag18, Ag19, Ag20);

    whilecount = 0;
    Telap_gpu = -1;   % -1 marks "no timing estimate yet"
    while Telap_gpu < Tmin
        whilecount = whilecount + 1;
        if Telap_gpu == -1
            % First pass: time two evaluations to estimate how many
            % repetitions are needed to reach about 1.5*Tmin.
            gsync;
            t1 = tic;
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05 * Ag06 * Ag07 ...
                * Ag08 * Ag09 * Ag10 * Ag11 * Ag12 * Ag13 * Ag14 ...
                * Ag15 * Ag16 * Ag17 * Ag18 * Ag19 * Ag20; % HERE %%%
            geval(Rg);
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05 * Ag06 * Ag07 ...
                * Ag08 * Ag09 * Ag10 * Ag11 * Ag12 * Ag13 * Ag14 ...
                * Ag15 * Ag16 * Ag17 * Ag18 * Ag19 * Ag20; % HERE %%%
            geval(Rg);
            gsync;
            Telap_gpu = toc(t1)/2;
            NoRunsGPU = ceil(1.5*Tmin/Telap_gpu);
        else
            % Previous attempt was shorter than Tmin: scale the run
            % count up, more aggressively for every retry.
            NoRunsGPU = ceil(1.5*whilecount*NoRunsGPU/Telap_gpu*Tmin);
        end

        % Warm-up
        gsync;
        for no=1:NoRunsGPU
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05 * Ag06 * Ag07 ...
                * Ag08 * Ag09 * Ag10 * Ag11 * Ag12 * Ag13 * Ag14 ...
                * Ag15 * Ag16 * Ag17 * Ag18 * Ag19 * Ag20; % HERE %%%
            geval(Rg);
        end

        % Benchmark
        gsync;
        tstart1 = tic;
        for no=1:NoRunsGPU
            Rg = Ag01 * Ag02 * Ag03 * Ag04 * Ag05 * Ag06 * Ag07 ...
                * Ag08 * Ag09 * Ag10 * Ag11 * Ag12 * Ag13 * Ag14 ...
                * Ag15 * Ag16 * Ag17 * Ag18 * Ag19 * Ag20; % HERE %%%
            geval(Rg);
        end
        gsync;
        Telap_gpu = toc(tstart1);
    end

    % Determine time for GPU loop alone
    RPT = min(5E3,ceil(MaxAvg/NoRunsGPU));
    tstart = tic;
    for AvgNo=1:RPT
        for no=1:NoRunsGPU
        end
    end
    T_GPU_Loop = toc(tstart)/RPT;

    % Compute GPU times (same flooring and flop count as for the CPU)
    T_GPU(ii) = max((Telap_gpu-T_GPU_Loop)/NoRunsGPU,2.5E-10);
    T_GPU_tot(ii) = Telap_gpu;
    fprintf(' | T_GPU: %6.1f,', T_GPU_tot(ii));
    GFlops_gpu(ii) = (19*N^2*(2*N-1))/(T_GPU(ii)*1E9);
    fprintf(' %7.1f [GFlops]', GFlops_gpu(ii));

    % Report the gpu_free field of gpu_entry(13), scaled by 1E6
    % NOTE(review): presumably free GPU memory in bytes - confirm.
    gpu_info = gpu_entry(13);
    Mem_MB(ii) = gpu_info.gpu_free/1E6;
    clear gpu_hook;
    fprintf(' | Mem free [MB]: %6.1f', Mem_MB(ii));
    % GPU test end --------------------------------------------------

    % Print *** as a warning for simulation time violation
    % (should not be possible unless something spooky is going on)
    if T_CPU_tot(ii)>=Tmin && T_GPU_tot(ii)>=Tmin
        fprintf('\n');
    else
        fprintf(' ***\n');
    end

    % Save data and plot for every 10 data points, and also after the
    % last data point so that trailing results are not lost.
    if mod(ii,10)==0 || ii==NoPoints
        save([ BasePath '.mat' ], 'Size', ...
            'T_CPU', 'T_CPU_tot', 'GFlops_cpu', ...
            'T_GPU', 'T_GPU_tot', 'GFlops_gpu');

        figure(1);
        clf(1);
        plot((19*Size(1:ii).^2.*(2*Size(1:ii)-1))/1E9, GFlops_cpu(1:ii), 'r-', ...
            (19*Size(1:ii).^2.*(2*Size(1:ii)-1))/1E9, GFlops_gpu(1:ii), 'g-', ...
            'Linewidth',1.5);
        grid;
        xlabel('Complexity [GFlop]');
        ylabel('Performance [GFlops]');
        legend('CPU', 'GPU', 'Location', 'SouthEast');
        title(['Mx20: ' TitleStr]);

        % Save figure
        print( gcf, '-djpeg99', '-r100', [ BasePath '.jpg' ] );
        print( gcf, '-depsc2', '-r2400', [ BasePath '.eps' ] );
    end
end

end