I'm trying to implement a K-means algorithm with semi-random selection of the initial centroids. I'm using Python (with numpy) to process the data and choose the initial centers, and the Python C API to implement the iterative part of K-means in C.
However, when I feed in relatively large datasets, I get a Segmentation fault (core dumped). So far I have tried to manage memory better and free all the global arrays before returning to Python; I have also tried to free all local arrays before the end of each function.
This is the code in python:
def Kmeans(K, iter, eps, file_name_1, file_name_2):
    """Run K-means++ initialisation in Python/numpy, then hand off the
    iterative K-means loop to the C extension module ``f`` and print the
    resulting centroids.

    K           -- number of clusters; must satisfy 1 < K < N
    iter        -- maximum iterations as a digit string; must satisfy 1 < iter < 1000
    eps         -- convergence threshold forwarded to the C module
    file_name_1, file_name_2 -- input files merged by get_compound_df

    NOTE(review): the parameter name ``iter`` shadows the builtin, but it is
    kept because renaming it would change the keyword-argument interface.
    """
    compound_df = get_compound_df(file_name_1, file_name_2)
    N, d = int(compound_df.shape[0]), int(compound_df.shape[1])
    data = np.array(pd.DataFrame.to_numpy(compound_df), dtype=float)
    # BUG FIX: validate isdigit() BEFORE calling int(iter) — the original
    # order raised ValueError (not the intended assertion message) whenever
    # `iter` was not a numeric string.
    assert iter.isdigit() and 1 < int(iter) < 1000, "Invalid maximum iteration!"
    assert 1 < int(K) < N, "Invalid number of clusters!"
    PP_centers = k_means_PP(compound_df, int(K))
    # Gather the chosen rows as the initial centroids in one step instead
    # of a manual append loop.
    actual_centroids = np.array([data[idx] for idx in PP_centers], dtype=float)
    # The C extension expects flat (row-major) Python lists of floats.
    data = data.ravel().tolist()
    actual_centroids = actual_centroids.ravel().tolist()
    print(PP_centers)
    print(f.fit(int(K), int(N), int(d), int(iter), float(eps), actual_centroids, data))
This is the C code that manages the PyObject creation; it builds the Python object that is returned to the Kmeans function:
/*
 * Convert a flat, row-major C array of K*d doubles into a Python list of
 * K inner lists, each holding d floats (one centroid per inner list).
 * K and d are globals defined elsewhere in the module.
 *
 * Returns a new reference on success, or NULL with a Python exception set;
 * on failure every object created so far is released.
 */
PyObject* convertCArrayToDoubleList(double* arr){
    int i, j;
    PyObject* K_centroid_list = PyList_New(K);
    if(!K_centroid_list)
        return NULL;
    for(i=0;i<K;++i){
        PyObject* current_center = PyList_New(d);
        /* BUG FIX: the original tested `!K_centroid_list` here, so when
         * PyList_New(d) failed (e.g. under memory pressure with large
         * datasets) current_center was NULL and execution fell through to
         * PyList_SET_ITEM on a NULL pointer — a segfault. */
        if(!current_center){
            Py_DECREF(K_centroid_list);
            return NULL;
        }
        for(j=0;j<d;++j){
            PyObject* num = PyFloat_FromDouble(arr[i*d+j]);
            if(!num){
                /* current_center is not yet owned by the outer list, so
                 * both references must be dropped explicitly.  Its NULL
                 * tail slots are tolerated by list deallocation. */
                Py_DECREF(current_center);
                Py_DECREF(K_centroid_list);
                return NULL;
            }
            /* PyList_SET_ITEM steals the reference to num. */
            PyList_SET_ITEM(current_center, j, num);
        }
        /* Steals the reference to current_center; the outer list now owns it. */
        PyList_SET_ITEM(K_centroid_list, i, current_center);
    }
    return K_centroid_list;
}
I ran valgrind on some samples; it reported some memory leaks, but I could not identify their source. I also tried various combinations of free and Py_DECREF to reduce the leakage, but to no avail.