I'm making a model that classifies .txt documents into categories, and I have a problem: the accuracy is only about 10%. Please suggest how to make it better. For the preprocessing I used Bag of Words (BoW); this is the code:
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import os

dataset_directories = ['dataset/sport', 'dataset/business', 'dataset/entertainment', 'dataset/food',
                       'dataset/technology', 'dataset/space', 'dataset/politics', 'dataset/medical',
                       'dataset/historical', 'dataset/graphics']

all_document_texts = []

# Load the custom stopword list (one word per line)
custom_stopwords_file = 'stopword'
with open(custom_stopwords_file, 'r') as stopword_file:
    custom_stopwords = stopword_file.read().splitlines()

bow = CountVectorizer(lowercase=True, stop_words=custom_stopwords)

def remove_custom_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in custom_stopwords]
    return ' '.join(filtered_words)

# Read every .txt file per category, strip the stopwords, and collect all texts
for data_directory in dataset_directories:
    document_texts = []
    for filename in os.listdir(data_directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(data_directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            document_texts.append(text)
    document_texts = [remove_custom_stopwords(text) for text in document_texts]
    all_document_texts.extend(document_texts)

bow_result = bow.fit_transform(all_document_texts)

joblib.dump(bow, 'bow_vectorizer.pkl')
joblib.dump(bow.vocabulary_, 'bow_vocabulary.pkl')

print('\nBag of Words (BoW) values:')
print(bow_result)
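
In case it helps, this is a quick sanity check of what the fitted vectorizer produced (assuming scikit-learn 1.0+ for get_feature_names_out; the actual sizes depend on my corpus):

# Quick check of the fitted CountVectorizer
print('Vocabulary size:', len(bow.vocabulary_))           # number of distinct terms kept
print('Document-term matrix shape:', bow_result.shape)    # (n_documents, vocabulary_size)
print('Sample feature names:', bow.get_feature_names_out()[:10])

Note that bow_result is a SciPy sparse matrix, which is why print(bow_result) shows (row, column) value entries instead of a dense table.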
This is what I used for normalizing the vectors:
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np

bow_vectorizer = joblib.load('bow_vectorizer.pkl')
bow_vocabulary = joblib.load('bow_vocabulary.pkl')

# Take the integer column indices stored in the vocabulary and standardize them
bow_vocabulary_array = np.array(list(bow_vocabulary.values()))
bow_vocabulary_array = bow_vocabulary_array.reshape(-1, 1)

scaler = StandardScaler()
bow_vocabulary_scaled = scaler.fit_transform(bow_vocabulary_array)

joblib.dump(bow_vocabulary_scaled, 'preprocessed_bow_vocabulary.pkl')
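
For completeness, this is what ends up in that file (I'm only describing the structure here; the actual numbers depend on the vocabulary):

import joblib

# Quick look at the saved, scaled vocabulary values
scaled_vocabulary = joblib.load('preprocessed_bow_vocabulary.pkl')
print(type(scaled_vocabulary))   # <class 'numpy.ndarray'>
print(scaled_vocabulary.shape)   # (vocabulary_size, 1): one standardized value per vocabulary index
print(scaled_vocabulary[:5])     # first few scaled entries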
And finally, this is the model:
import os
import numpy as np
import joblib
from collections import defaultdict
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

learning_rate = 0.001

bow_vectorizer = joblib.load('preprocessed_bow_vocabulary.pkl')

dataset_directories = ['dataset/sport', 'dataset/business', 'dataset/entertainment', 'dataset/food',
                       'dataset/technology', 'dataset/space', 'dataset/politics', 'dataset/medical',
                       'dataset/historical', 'dataset/graphics']

texts = []
labels = []
label_counts = defaultdict(int)

# Read every .txt file and use the directory index as the class label
for i, data_directory in enumerate(dataset_directories):
    for filename in os.listdir(data_directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(data_directory, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                texts.append(text)
                labels.append(i)
                label_counts[i] += 1
            except Exception as e:
                print(f"Error reading file '{file_path}': {e}")

# Turn each document into a sequence (one value per word, 0 if not found),
# then pad every sequence to the length of the longest document
max_len = max(len(text.split()) for text in texts)
X_pad = []
for text in texts:
    bow_vector = []
    for word in text.split():
        if word in bow_vectorizer:
            bow_vector.append(bow_vectorizer[word])
        else:
            bow_vector.append(0)
    padded_vector = pad_sequences([bow_vector], padding='post', maxlen=max_len)
    X_pad.append(padded_vector)

X_pad = np.array(X_pad).squeeze()
y = np.array(labels)

print("Shape of X_pad:", X_pad.shape)
print("Number of labeled files per category:")
for label, count in label_counts.items():
    print(f"Category {label}: {count} files")
print("Total labeled files:", len(texts))

if len(X_pad) != len(y):
    print("Error: Number of samples in X_pad and y do not match.")
else:
    print("Number of samples in X_pad and y match.")

X_train, X_val, y_train, y_val = train_test_split(X_pad, y, test_size=0.2, random_state=42)

input_dim = X_pad.shape[1]
num_words = X_pad.shape[1]
embedding_dim = 100

model = Sequential([
    Embedding(input_dim=num_words, output_dim=embedding_dim),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(dataset_directories), activation='softmax')
])

optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=40, batch_size=64)

accuracy = history.history['accuracy'][-1]
print("Accuracy: %.2f%%" % (accuracy * 100))

model.save('lstm_model.keras')
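
For reference, the accuracy printed above is only the training accuracy from the last epoch; checking the held-out 20% from train_test_split would look roughly like this (same variables as above):

# Minimal sketch: evaluate on the validation split created earlier
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
print("Validation accuracy: %.2f%%" % (val_accuracy * 100))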
I want the accuracy to be at least about 60%.