simd performance now uses aligned memory

This commit is contained in:
Mario Mulansky 2014-11-12 11:07:52 +01:00
parent da5388eb87
commit 430fe8183b
3 changed files with 34 additions and 11 deletions

View File

@ -10,6 +10,9 @@
# you also need NT2's SIMD library available; set the include path here:
# SIMD_INCLUDE = /path/to/simd/include
INCLUDES += -I$(BOOST_ROOT) -I${SIMD_INCLUDE}
# INTEL COMPILER
# change this if you want to cross-compile
ARCH = Host
# ARCH = AVX
@ -17,6 +20,14 @@ ARCH = Host
CXX = icpc
CC = icpc
INCLUDES += -I../../include/ -I$(BOOST_ROOT) -I${SIMD_INCLUDE}
CXXFLAGS = -Ofast -x${ARCH} -fno-alias -ip -inline-forceinline -std=c++0x -DNDEBUG ${INCLUDES}
# GCC COMPILER
# change this if you want to cross-compile
# ARCH = native
# # ARCH = core-avx-i
# CXX = g++
# CC = g++
# CXXFLAGS = -O3 -ffast-math -mtune=${ARCH} -march=${ARCH} -std=c++0x ${INCLUDES}

View File

@ -21,19 +21,23 @@
#include <boost/numeric/odeint.hpp>
#include <boost/simd/sdk/simd/pack.hpp>
#include <boost/simd/sdk/simd/io.hpp>
#include <boost/simd/memory/allocator.hpp>
#include <boost/simd/include/functions/splat.hpp>
#include <boost/simd/include/functions/plus.hpp>
#include <boost/simd/include/functions/multiplies.hpp>
namespace odeint = boost::numeric::odeint;
namespace simd = boost::simd;
typedef boost::timer timer_type;
static const size_t dim = 3; // roessler is 3D
typedef boost::simd::pack<double> simd_pack;
typedef simd::pack<double> simd_pack;
typedef boost::array<simd_pack, dim> state_type;
typedef std::vector<state_type> state_vec;
// use the simd allocator to get properly aligned memory
typedef std::vector< state_type, simd::allocator< state_type > > state_vec;
static const size_t pack_size = simd_pack::static_size;

View File

@ -28,20 +28,28 @@ t_intel = [get_runtime_from_file("perf_workbook/odeint_rk4_array_intel.perf"),
get_runtime_from_file("perf_ariel/odeint_rk4_array_intel.perf"),
get_runtime_from_file("perf_lyra/odeint_rk4_array_intel.perf")]
t_gfort = [get_runtime_from_file("perf_workbook/odeint_rk4_array_gfort.perf"),
get_runtime_from_file("perf_ariel/odeint_rk4_array_gfort.perf"),
get_runtime_from_file("perf_lyra/odeint_rk4_array_gfort.perf")]
t_gfort = [get_runtime_from_file("perf_workbook/rk4_gfort.perf"),
get_runtime_from_file("perf_ariel/rk4_gfort.perf"),
get_runtime_from_file("perf_lyra/rk4_gfort.perf")]
t_c_intel = [get_runtime_from_file("perf_workbook/rk4_c_intel.perf"),
get_runtime_from_file("perf_ariel/rk4_c_intel.perf"),
get_runtime_from_file("perf_lyra/rk4_c_intel.perf")]
print t_c_intel
ind = np.arange(3) # the x locations for the groups
width = 0.2 # the width of the bars
width = 0.15 # the width of the bars
fig = plt.figure()
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, t_gcc, width, color='b', label="odeint gcc")
rects2 = ax.bar(ind+width, t_intel, width, color='g', label="odeint intel")
rects3 = ax.bar(ind+2*width, t_gfort, width, color='c', label="gfort")
rects3 = ax.bar(ind+2*width, t_c_intel, width, color='y', label="C intel")
rects4 = ax.bar(ind+3*width, t_gfort, width, color='c', label="gfort")
ax.axis([-width, 2.0+4*width, 0.0, 0.85])
ax.axis([-width, 2.0+5*width, 0.0, 0.85])
ax.set_ylabel('Runtime (s)')
ax.set_title('Performance for integrating the Lorenz system')
ax.set_xticks(ind + 1.5*width)
@ -51,6 +59,6 @@ ax.set_xticklabels(('Core i5-3210M\n3.1 GHz',
ax.legend(loc='upper left', prop={'size': 16})
plt.savefig("perf.pdf")
plt.savefig("perf.png")
plt.savefig("perf.png", dpi=50)
plt.show()