# app_dbscan.py
# Flask service that clusters job descriptions with TF-IDF + DBSCAN and
# exposes the result through a POST /predict endpoint.
import os
from flask import Flask, request, jsonify
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import logging
from dotenv import load_dotenv
# Fetch the NLTK data used by TextPreprocessor at import time.
# Recent NLTK releases (3.9+) make word_tokenize load its models from the
# "punkt_tab" package instead of the pickled "punkt" one, so both are
# requested to keep tokenisation working regardless of the installed version.
# NOTE(review): on an NLTK version whose index lacks "punkt_tab" the download
# call is expected to just report the miss and return False - confirm.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)

# Load environment variables from .env file
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO)

app = Flask(__name__)
# Text Preprocessing Class
class TextPreprocessor:
    """Turn raw text into a space-separated string of stemmed tokens."""

    def __init__(self):
        # Porter stemmer and the English stop-word list are built once per
        # instance and reused for every call to preprocess().
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        """Tokenise *text*, lowercase it, drop stop words and stem the rest.

        Returns the surviving stems joined by single spaces.
        """
        stem = self.stemmer.stem
        kept = []
        for token in nltk.word_tokenize(text):
            lowered = token.lower()
            if lowered in self.stop_words:
                continue
            kept.append(stem(lowered))
        return ' '.join(kept)
# Load and preprocess the data
def load_dataset(file_path):
    """Read the CSV at *file_path* and return its preprocessed job descriptions.

    Only the 'Job Description' column is used; rows where it is missing are
    dropped, and every remaining value is run through
    TextPreprocessor.preprocess. The result is a pandas Series of strings.
    """
    logging.info("Loading dataset from %s", file_path)
    frame = pd.read_csv(file_path)
    descriptions = frame['Job Description'].dropna()
    return descriptions.apply(TextPreprocessor().preprocess)
def get_top_keywords(vectorizer, model, num_keywords=10):
    """Return the highest-weighted vocabulary terms for each DBSCAN cluster.

    DBSCAN has no centroids: ``model.components_`` holds one row per *core
    sample* (aligned with ``model.core_sample_indices_``), not one row per
    cluster. A cluster's representative vector is therefore computed here as
    the mean of the core samples whose label equals that cluster.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``.
        model: fitted DBSCAN-like object exposing ``labels_``,
            ``core_sample_indices_`` and ``components_``.
        num_keywords: how many terms to keep per cluster.

    Returns:
        dict mapping each non-noise cluster label (plain ``int``) to a list
        of terms (plain ``str``) ordered by ascending weight, i.e. the
        strongest term is last.
    """
    import numpy as np  # function-scope: numpy is not imported at file level

    feature_names = vectorizer.get_feature_names_out()
    labels = np.asarray(model.labels_)
    # Label of every core sample, in the same row order as components_.
    core_labels = labels[model.core_sample_indices_]

    top_keywords = {}
    for cluster_label in sorted(set(labels.tolist())):
        if cluster_label == -1:
            # -1 is DBSCAN's noise label, not a cluster.
            continue
        members = model.components_[core_labels == cluster_label]
        if members.shape[0] == 0:
            # Defensive: a real cluster always has at least one core sample.
            continue
        # asarray/ravel also flattens the matrix a sparse input would produce.
        centroid = np.asarray(members.mean(axis=0)).ravel()
        # Guard num_keywords <= 0: a "[-0:]" slice would return every term.
        top = centroid.argsort()[-num_keywords:] if num_keywords > 0 else []
        top_keywords[cluster_label] = [str(feature_names[idx]) for idx in top]
    return top_keywords
# Load data and train the model
def initialize_model(dataset_path, max_features=1000, epsilon=0.5, min_samples=5):
    """Build the TF-IDF vectorizer and DBSCAN model from the dataset.

    Loads and preprocesses the job descriptions at *dataset_path*, vectorises
    them with at most *max_features* TF-IDF terms, and clusters the dense
    matrix with DBSCAN using cosine distance.

    Returns a ``(vectorizer, model, keywords)`` tuple, where *keywords* is the
    result of ``get_top_keywords`` for the fitted pair.
    """
    corpus = load_dataset(dataset_path)
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(corpus).toarray()
    model = DBSCAN(eps=epsilon, min_samples=min_samples, metric='cosine')
    model.fit(tfidf_matrix)
    return vectorizer, model, get_top_keywords(vectorizer, model)
# Map cluster labels to departments
def map_clusters_to_departments(top_keywords):
    """Assign a department name to every cluster based on its keywords.

    The keywords are TF-IDF vocabulary terms, and the corpus is Porter-stemmed
    by TextPreprocessor before vectorisation, so they arrive as stems
    ("engin", "softwar", "servic", ...). Each trigger set therefore lists both
    the plain word and its Porter stem; matching only the plain words would
    send almost every cluster to "Miscellaneous".

    Args:
        top_keywords: dict mapping cluster label -> iterable of keyword strings.

    Returns:
        dict mapping each cluster label to a department name. Departments are
        tried in the order listed below and the first match wins; clusters
        with no matching keyword map to "Miscellaneous".
    """
    department_triggers = (
        ("Engineering", {"engineer", "engin", "developer", "develop",
                         "software", "softwar"}),
        ("Support", {"support", "help", "service", "servic"}),
        ("Data Science", {"data", "science", "scienc", "analysis", "analysi"}),
        ("Marketing", {"marketing", "market", "brand", "campaign"}),
    )

    cluster_department_mapping = {}
    for cluster, keywords in top_keywords.items():
        cluster_department_mapping[cluster] = next(
            (
                department
                for department, triggers in department_triggers
                if not triggers.isdisjoint(keywords)
            ),
            "Miscellaneous",
        )
    return cluster_department_mapping
# Runtime settings: each value comes from the environment when set and
# otherwise falls back to the default shown here.
dataset_path = os.environ.get("DATASET_PATH", "./data/Booking_Jobs_All_220218.csv")
tfidf_max_features = int(os.environ.get("TFIDF_MAX_FEATURES", "1000"))
dbscan_eps = float(os.environ.get("DBSCAN_EPSILON", "0.5"))
dbscan_min_samples = int(os.environ.get("DBSCAN_MIN_SAMPLES", "5"))

# Train once at import time; the request handler reads these globals.
_model_settings = {
    "max_features": tfidf_max_features,
    "epsilon": dbscan_eps,
    "min_samples": dbscan_min_samples,
}
vectorizer, model, top_keywords = initialize_model(dataset_path, **_model_settings)

# Department name for every cluster the model found
cluster_department_mapping = map_clusters_to_departments(top_keywords)
def _nearest_core_label(fitted_model, vector):
    """Return the cluster of the closest core sample within eps, else -1.

    DBSCAN has no ``predict``; calling ``fit``/``fit_predict`` on new data
    re-trains the model on that data alone. A new point is instead given the
    label of its nearest core sample when that sample lies within
    ``fitted_model.eps`` (cosine distance, the metric the model is fitted with
    in initialize_model), and is treated as noise (-1) otherwise.
    """
    import numpy as np  # function-scope: numpy is not imported at file level

    core = fitted_model.components_
    if hasattr(core, "toarray"):
        core = core.toarray()
    core = np.asarray(core, dtype=float)
    if core.shape[0] == 0:
        # No core samples means the model found no clusters at all.
        return -1

    point = np.asarray(vector, dtype=float).ravel()
    # Zero-norm vectors get norm 1 so they end up at cosine distance 1 from
    # everything instead of triggering a division by zero.
    point_norm = np.linalg.norm(point) or 1.0
    core_norms = np.linalg.norm(core, axis=1)
    core_norms[core_norms == 0] = 1.0

    distances = 1.0 - (core @ point) / (core_norms * point_norm)
    nearest = int(np.argmin(distances))
    if distances[nearest] > fitted_model.eps:
        return -1
    return int(fitted_model.labels_[fitted_model.core_sample_indices_[nearest]])


@app.route('/predict', methods=['POST'])
def predict():
    """Classify the posted job description into a cluster and department.

    Expects a JSON object with a non-empty string under 'job_description'.

    Responses:
        200: {'cluster', 'department', 'keywords'} for a recognised cluster.
        400: missing/invalid job description, or the text is noise (-1).
        500: any unexpected failure; the error text is returned.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body, so
        # bad input reaches the 400 branch below instead of the 500 handler.
        data = request.get_json(silent=True)
        logging.info("Received prediction request with data: %s", data)

        job_description = data.get('job_description') if isinstance(data, dict) else None
        if not job_description or not isinstance(job_description, str):
            logging.error("Job description not provided in the request data")
            return jsonify({'error': 'Job description not provided'}), 400

        preprocessor = TextPreprocessor()
        preprocessed_text = preprocessor.preprocess(job_description)
        X_new = vectorizer.transform([preprocessed_text]).toarray()

        # Read-only lookup against the trained model; the model must not be
        # refitted per request.
        cluster = _nearest_core_label(model, X_new[0])
        if cluster == -1:
            logging.warning("Job description classified as noise")
            return jsonify({'cluster': cluster, 'error': 'This data point is considered noise'}), 400

        department = cluster_department_mapping.get(cluster, "Unknown")
        keywords = top_keywords.get(cluster, [])
        logging.info("Cluster: %s, Department: %s, Keywords: %s", cluster, department, keywords)

        response = {
            'cluster': cluster,
            'department': department,
            'keywords': keywords
        }
        return jsonify(response)
    except Exception as e:
        logging.exception("Exception occurred during prediction")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    port = int(os.getenv("FLASK_PORT", 6000))
    # Debug mode turns on the Werkzeug interactive debugger, which lets
    # anyone who can reach the server execute code, so it is opt-in through
    # FLASK_DEBUG rather than always on.
    debug = os.getenv("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
    app.run(debug=debug, port=port)