Abstracts I am currently developing a program that uses a genetic algorithm to solve a combinatorial optimization problem. The objective is to select a control group of 2000 people from 5000 people with attribute values that are closest to the test group. However, when I run the program, I am experiencing problems with the same person being selected multiple times.
Issue Details
- I am using the code below and it is not working as expected. Sometimes certain individuals are selected more than once.
- Attempts to solve the problem: Questions to ChatGPT, consulting an engineer who is not an algorithm expert.
Goal we are trying to achieve
- We want to find a way to prevent the same person from being selected multiple times.
- We are looking for a way to solve the problem without compromising computational efficiency.
Question
- How can I prevent the same person from being selected more than once in a program using a genetic algorithm?
Code
import pandas as pd
import numpy as np
from deap import base, creator, tools, algorithms
import random
import time
# Dictionary of configuration variables
CONFIG = {
"file_path": 'C:/Python用/test1_GENIE.csv', #File path of read data
"population_size": 1000,
"generations": 100,
"cxpb": 0.5,
"mutpb": 0.2,
"n_control": 2000,
"pass_threshold": 0.05,
"output_path": 'outputpath'
}
# File path
file_path = CONFIG["file_path"]
# data load
data = pd.read_csv(file_path)
# Separate data into test and control groups
test_group = data[data['is_test'] == 1]
control_group = data[data['is_test'] == 0]
# Calculation of mean value agreement for each column
def calculate_match_rates_vectorized(test_group, control_group, columns):
# Calculate the mean of the test and control groups for each column
means_test = test_group[columns].mean()
means_control = control_group[columns].mean()
# Replace zero values with smaller numbers to avoid dividing by zero
means_control[means_control == 0] = np.finfo(float).eps
# Calculation of Match Ratio
match_rates = abs(1 - (means_test / means_control))
return match_rates
# Calculation of the evaluation function
columns = data.columns[3:]
# Genetic Algorithm Parameters
POPULATION_SIZE = CONFIG["population_size"]
GENERATIONS = CONFIG["generations"]
CXPB = CONFIG["cxpb"]
MUTPB = CONFIG["mutpb"]
N_CONTROL = CONFIG["n_control"]
# objective function
def evaluate(individual):
selected_control_group = control_group.iloc[individual]
match_rates = calculate_match_rates_vectorized(test_group, selected_control_group, columns)
max_match_rate = max(match_rates)
return (max_match_rate,)
# Genetic Algorithm Initialization
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("indices", random.sample, range(len(control_group)), N_CONTROL)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.indices)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
# Record current time before algorithm execution
start_time = time.time()
# Running Genetic Algorithms
def main():
random.seed(64)
pop = toolbox.population(n=POPULATION_SIZE)
hof = tools.HallOfFame(1)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("min", np.min)
stats.register("max", np.max)
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=CXPB, mutpb=MUTPB, ngen=GENERATIONS,
stats=stats, halloffame=hof, verbose=True)
# Record the time after the algorithm is executed and calculate the execution time
end_time = time.time()
elapsed_time = end_time - start_time
print(f"run time: {elapsed_time} sec")
# Output the index of the best individuals of the last generation to a CSV file
best_individual = hof[0]
best_control_group = control_group.iloc[best_individual]
output_path = CONFIG["output_path"]
best_control_group.to_csv(output_path, index=False)
# Display of success message
print(f"Data has been successfully saved. Save to: {output_path}")
return pop, log, hof
if __name__ == "__main__":
pop, log, hof = main()