simd performance now uses aligned memory

2025-05-09 23:24:01 +00:00 · 2014-11-12 11:07:52 +01:00 · 2014-11-12 11:07:52 +01:00 · 430fe8183b
commit 430fe8183b
parent da5388eb87
3 changed files with 34 additions and 11 deletions
--- a/performance/SIMD/Makefile
+++ b/performance/SIMD/Makefile
@@ -10,6 +10,9 @@
 # you also need NT2's SIMD library available; set the include path here:
 # SIMD_INCLUDE = /path/to/simd/include

+INCLUDES += -I$(BOOST_ROOT) -I${SIMD_INCLUDE}
+
+# INTEL COMPILER
 # change this if you want to cross-compile
 ARCH = Host
 # ARCH = AVX
@@ -17,6 +20,14 @@ ARCH = Host

 CXX = icpc
 CC = icpc
-
-INCLUDES += -I../../include/ -I$(BOOST_ROOT) -I${SIMD_INCLUDE}
 CXXFLAGS = -Ofast -x${ARCH} -fno-alias -ip -inline-forceinline -std=c++0x -DNDEBUG ${INCLUDES}
+
+
+# GCC COMPILER
+# change this if you want to cross-compile
+# ARCH = native
+# # ARCH = core-avx-i
+
+# CXX = g++
+# CC = g++
+# CXXFLAGS = -O3 -ffast-math -mtune=${ARCH} -march=${ARCH} -std=c++0x ${INCLUDES}
--- a/performance/SIMD/roessler_simd.cpp
+++ b/performance/SIMD/roessler_simd.cpp
@@ -21,19 +21,23 @@
 #include <boost/numeric/odeint.hpp>
 #include <boost/simd/sdk/simd/pack.hpp>
 #include <boost/simd/sdk/simd/io.hpp>
+#include <boost/simd/memory/allocator.hpp>
 #include <boost/simd/include/functions/splat.hpp>
 #include <boost/simd/include/functions/plus.hpp>
 #include <boost/simd/include/functions/multiplies.hpp>

+
 namespace odeint = boost::numeric::odeint;
+namespace simd = boost::simd;

 typedef boost::timer timer_type;

 static const size_t dim = 3;  // roessler is 3D

-typedef boost::simd::pack<double> simd_pack;
+typedef simd::pack<double> simd_pack;
 typedef boost::array<simd_pack, dim> state_type;
-typedef std::vector<state_type> state_vec;
+// use the simd allocator to get properly aligned memory
+typedef std::vector< state_type, simd::allocator< state_type > > state_vec;

 static const size_t pack_size = simd_pack::static_size;

--- a/performance/plot_result.py
+++ b/performance/plot_result.py
@@ -28,20 +28,28 @@ t_intel = [get_runtime_from_file("perf_workbook/odeint_rk4_array_intel.perf"),
           get_runtime_from_file("perf_ariel/odeint_rk4_array_intel.perf"),
           get_runtime_from_file("perf_lyra/odeint_rk4_array_intel.perf")]

-t_gfort = [get_runtime_from_file("perf_workbook/odeint_rk4_array_gfort.perf"),
-           get_runtime_from_file("perf_ariel/odeint_rk4_array_gfort.perf"),
-           get_runtime_from_file("perf_lyra/odeint_rk4_array_gfort.perf")]
+t_gfort = [get_runtime_from_file("perf_workbook/rk4_gfort.perf"),
+           get_runtime_from_file("perf_ariel/rk4_gfort.perf"),
+           get_runtime_from_file("perf_lyra/rk4_gfort.perf")]
+
+t_c_intel = [get_runtime_from_file("perf_workbook/rk4_c_intel.perf"),
+             get_runtime_from_file("perf_ariel/rk4_c_intel.perf"),
+             get_runtime_from_file("perf_lyra/rk4_c_intel.perf")]
+
+print t_c_intel
+

 ind = np.arange(3)  # the x locations for the groups
-width = 0.2         # the width of the bars
+width = 0.15         # the width of the bars

 fig = plt.figure()
 ax = fig.add_subplot(111)
 rects1 = ax.bar(ind, t_gcc, width, color='b', label="odeint gcc")
 rects2 = ax.bar(ind+width, t_intel, width, color='g', label="odeint intel")
-rects3 = ax.bar(ind+2*width, t_gfort, width, color='c', label="gfort")
+rects3 = ax.bar(ind+2*width, t_c_intel, width, color='y', label="C intel")
+rects4 = ax.bar(ind+3*width, t_gfort, width, color='c', label="gfort")

-ax.axis([-width, 2.0+4*width, 0.0, 0.85])
+ax.axis([-width, 2.0+5*width, 0.0, 0.85])
 ax.set_ylabel('Runtime (s)')
 ax.set_title('Performance for integrating the Lorenz system')
 ax.set_xticks(ind + 1.5*width)
@@ -51,6 +59,6 @@ ax.set_xticklabels(('Core i5-3210M\n3.1 GHz',
 ax.legend(loc='upper left', prop={'size': 16})

 plt.savefig("perf.pdf")
-plt.savefig("perf.png")
+plt.savefig("perf.png", dpi=50)

 plt.show()