I assumed that fixed-point arithmetic has lower latency and higher energy efficiency than floating point, so I tried to verify that on my PC. I am running Ubuntu 12.04 with clang version 14.0.0-1ubuntu1.1 (Target: x86_64-pc-linux-gnu), and I compile the following C++ code with the command: clang++ -O0 ShortVsInt.cpp -o ShortVsInt
But as a result, I get:
used time for int 10613860 ns.
used time for short 10299391 ns.
used time for float 10453464 ns.
Here is my program, ShortVsInt.cpp:
#include <iostream>
#include <random>
#include <vector>
#include <chrono>
#include <cstdlib>   // malloc, free
#include <cstdint>   // INT16_MIN, INT16_MAX
#include <ctime>     // time

#define MatrixSize 10000000
#define loopSize 1

int main()
{
    int* i_a = (int*) malloc(sizeof(int) * MatrixSize);
    int* i_b = (int*) malloc(sizeof(int) * MatrixSize);
    int* i_c = (int*) malloc(sizeof(int) * MatrixSize);
    // int i_a[MatrixSize];
    // int i_b[MatrixSize];
    // int i_c[MatrixSize];
    short* s_a = (short*) malloc(sizeof(short) * MatrixSize);
    short* s_b = (short*) malloc(sizeof(short) * MatrixSize);
    short* s_c = (short*) malloc(sizeof(short) * MatrixSize);
    float* f_a = (float*) malloc(sizeof(float) * MatrixSize);
    float* f_b = (float*) malloc(sizeof(float) * MatrixSize);
    float* f_c = (float*) malloc(sizeof(float) * MatrixSize);

    std::mt19937 mt_random{static_cast<unsigned long>(time(nullptr))};
    // Random values limited to the short range so all three types hold comparable data.
    std::uniform_real_distribution<> dist_short(INT16_MIN, INT16_MAX);
    // std::uniform_real_distribution<> dist_int(INT32_MIN, INT32_MAX);
    // std::uniform_real_distribution<> dist_float(INT32_MIN, INT32_MAX);

    for (int i = 0; i < MatrixSize; i++)
    {
        i_a[i] = static_cast<int>(dist_short(mt_random));
        i_b[i] = static_cast<int>(dist_short(mt_random));
        i_c[i] = 0;
        s_a[i] = static_cast<short>(dist_short(mt_random));
        s_b[i] = static_cast<short>(dist_short(mt_random));
        s_c[i] = 0;
        f_a[i] = static_cast<float>(dist_short(mt_random));
        f_b[i] = static_cast<float>(dist_short(mt_random));
        f_c[i] = 0.0f;
    }

    // Time the int multiply-add loop.
    auto start_int = std::chrono::steady_clock::now();
    for (int k = 0; k < loopSize; k++)
    {
        for (int i = 0; i < MatrixSize; i++)
        {
            i_c[i] = i_a[i] * i_b[i] + i_a[i];
        }
    }
    auto end_int = std::chrono::steady_clock::now();
    auto time_int = std::chrono::duration_cast<std::chrono::nanoseconds>(end_int - start_int);

    // Time the short multiply-add loop.
    auto start_short = std::chrono::steady_clock::now();
    for (int k = 0; k < loopSize; k++)
    {
        for (int i = 0; i < MatrixSize; i++)
        {
            s_c[i] = s_a[i] * s_b[i] + s_a[i];
        }
    }
    auto end_short = std::chrono::steady_clock::now();
    auto time_short = std::chrono::duration_cast<std::chrono::nanoseconds>(end_short - start_short);

    // Time the float multiply-add loop.
    auto start_float = std::chrono::steady_clock::now();
    for (int k = 0; k < loopSize; k++)
    {
        for (int i = 0; i < MatrixSize; i++)
        {
            f_c[i] = f_a[i] * f_b[i] + f_a[i];
        }
    }
    auto end_float = std::chrono::steady_clock::now();
    auto time_float = std::chrono::duration_cast<std::chrono::nanoseconds>(end_float - start_float);

    // printMatrix<float>(m_a, m_b, m_c);
    std::cout << "used time for int " << time_int.count() / loopSize << " ns." << std::endl;
    std::cout << "used time for short " << time_short.count() / loopSize << " ns." << std::endl;
    std::cout << "used time for float " << time_float.count() / loopSize << " ns." << std::endl;

    free(i_a); free(i_b); free(i_c);
    free(s_a); free(s_b); free(s_c);
    free(f_a); free(f_b); free(f_c);
    return 0;
}
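The three timed loops are identical apart from the element type; the way I think about it, each of them is an instance of one templated multiply-add kernel like the sketch below (this is only how I reason about the benchmark, not extra code I actually timed):

template <typename T>
void multiply_add(const T* a, const T* b, T* c, int n)
{
    // Same body as each timed loop above: c[i] = a[i] * b[i] + a[i]
    for (int i = 0; i < n; i++)
    {
        c[i] = a[i] * b[i] + a[i];
    }
}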
I have run the program several times and it always shows me the same result. Can someone give me an idea why the runtime of the int and short operations is not better than that of float? Does my compiler or CPU do some optimization in the background?
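(If it helps with answering: I assume I could also dump the generated assembly to see what the compiler actually emits, with something like

clang++ -O0 -S ShortVsInt.cpp -o ShortVsInt.s

but I have not analyzed that output myself.)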