I want to group PDF documents that are at least 80% similar using the K-Means algorithm and ML.NET. I am reading the text from the PDF files. My requirement is that the documents be grouped according to whatever similarity percentage the user enters; for example, if the user enters 70%, the documents in a group should be at least 70% similar.
Also, how can I get the Euclidean distance of each document from its cluster centroid?
I am new to ML.NET and to the algorithm. Please help and guide me. Thanks.
/// <summary>
/// Output row produced by the prediction engine for a single input document.
/// Each property is bound to a model output column through its
/// <c>ColumnName</c> attribute.
/// </summary>
public class Prediction
{
    /// <summary>
    /// Value of the <c>PredictedLabel</c> output column: the identifier of the
    /// cluster the document was assigned to.
    /// </summary>
    [ColumnName("PredictedLabel")]
    public uint Cluster { get; set; }

    /// <summary>
    /// Value of the <c>Score</c> output column: one entry per cluster.
    /// </summary>
    /// <remarks>
    /// NOTE(review): for the ML.NET K-Means trainer this vector is documented as
    /// the distances from the input to every cluster centroid; whether the values
    /// are plain or squared Euclidean distances should be confirmed against the
    /// ML.NET version in use before comparing them with a user-supplied threshold.
    /// </remarks>
    [ColumnName("Score")]
    public float[] Distances { get; set; }
}
/// <summary>
/// Builds a text-featurization pipeline that ends in a K-Means trainer, fits it
/// on <paramref name="data"/>, and stores a prediction engine for the fitted
/// model in <c>_predictionEngine</c>.
/// </summary>
/// <param name="data">Documents to cluster; the pipeline reads and rewrites the <c>Text</c> column.</param>
/// <param name="numberOfClusters">Number of clusters passed to the K-Means trainer.</param>
public void Train(IEnumerable<TextData> data, int numberOfClusters)
{
    var mlContext = new MLContext();
    var textDataView = mlContext.Data.LoadFromEnumerable(data);

    // Every step transforms the "Text" column in place: normalize -> tokenize ->
    // drop stop words -> map tokens to keys -> n-gram counts -> Lp-norm ->
    // min-max scaling -> K-Means on the resulting numeric vector.
    var textEstimator = mlContext.Transforms.Text.NormalizeText("Text")
        .Append(mlContext.Transforms.Text.TokenizeIntoWords("Text"))
        .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Text"))
        .Append(mlContext.Transforms.Conversion.MapValueToKey("Text"))
        .Append(mlContext.Transforms.Text.ProduceNgrams("Text"))
        .Append(mlContext.Transforms.NormalizeLpNorm("Text"))
        .Append(mlContext.Transforms.NormalizeMinMax("Text"))
        .Append(mlContext.Clustering.Trainers.KMeans(
            "Text",
            numberOfClusters: numberOfClusters));

    var model = textEstimator.Fit(textDataView);

    // The original statement was cut off before its argument list; the engine
    // has to be created from the fitted model.
    _predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, Prediction>(model);

    // Sample prediction kept from the original code. The literal used to be split
    // across two physical lines, which is not legal for a regular C# string; the
    // two halves are joined with a single space so they still tokenize as
    // separate words.
    var sample = new TextData { Text = "19D000XXX Susan Porter 821442289 Information required" };
    Prediction samplePrediction = _predictionEngine.Predict(sample);

    // A bare property access is not a valid statement, so the value is discarded
    // explicitly. samplePrediction.Distances holds the per-cluster "Score" vector
    // for the same input.
    _ = samplePrediction.Cluster;
}