-
Notifications
You must be signed in to change notification settings - Fork 1
/
generateStudents.py
237 lines (199 loc) · 8.85 KB
/
generateStudents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import json
import os
import random
import re

import openai
import pandas as pd
from faker import Faker
from tqdm import tqdm # For the progress bar
# Initialize Faker
fake = Faker()

# Prefer a key supplied through the OPENAI_API_KEY environment variable so a
# real secret never has to be written into this file; when the variable is
# unset or empty, fall back to the original placeholder value.
openai.api_key = os.environ.get("OPENAI_API_KEY") or "YOUR_API_KEY_HERE"

# Load courses data from JSON. The encoding is passed explicitly because the
# platform default encoding is not guaranteed to be UTF-8.
with open('structured_courses.json', 'r', encoding='utf-8') as f:
    courses_data = json.load(f)
# Baseline share of students per major, expressed as fractions.
# The two tuples below are paired position by position: major -> share.
base_distribution = dict(zip(
    (
        "Computer Science",
        "Engineering",
        "Business",
        "Physics",
        "Biology",
        "Chemistry",
        "Other",
    ),
    (0.2, 0.2, 0.2, 0.05, 0.1, 0.05, 0.2),
))
# "Help needed" course pools: for every major, one list of candidate courses
# per class year. Each major's row lists its pools in the fixed order
# Freshman, Sophomore, Junior, Senior, which is zipped onto the year names.
help_courses = {
    major: dict(zip(("Freshman", "Sophomore", "Junior", "Senior"), pools_by_year))
    for major, pools_by_year in (
        ("Computer Science", (
            ["CSE 110", "MAT 117", "CSE 180", "CSE 120"],
            ["CSE 205", "CSE 240", "MAT 243", "CSE 220"],
            ["CSE 310", "CSE 360", "CSE 365", "CSE 340"],
            ["CSE 410", "CSE 445", "CSE 450", "CSE 460"],
        )),
        ("Engineering", (
            ["FSE 100", "MAE 102", "EEE 120", "MAE 111"],
            ["MAE 213", "EEE 202", "EGR 220", "MAE 212"],
            ["MAE 241", "EEE 330", "MAE 340", "EGR 335"],
            ["MAE 400", "EEE 452", "MAE 441", "MAE 460"],
        )),
        ("Business", (
            ["WPC 101", "COM 100", "ECN 110", "ACC 201"],
            ["ECN 211", "WPC 247", "COM 230", "ACC 212"],
            ["WPC 347", "FIN 300", "MGT 300", "MKT 302"],
            ["MGT 400", "WPC 451", "FIN 402", "MKT 450"],
        )),
        ("Physics", (
            ["PHY 121", "PHY 131", "MAT 170", "PHY 100"],
            ["PHY 202", "PHY 252", "PHY 201", "MAT 210"],
            ["PHY 314", "PHY 301", "PHY 360", "PHY 342"],
            ["PHY 452", "PHY 462", "PHY 498", "PHY 480"],
        )),
        ("Biology", (
            ["BIO 100", "BIO 160", "CHM 101", "MAT 117"],
            ["BIO 182", "BIO 202", "MIC 205", "BIO 230"],
            ["BIO 320", "BIO 340", "BIO 351", "BIO 370"],
            ["BIO 420", "BIO 440", "BIO 495", "BIO 480"],
        )),
        ("Chemistry", (
            ["CHM 113", "CHM 117", "MAT 170", "CHM 100"],
            ["CHM 233", "CHM 234", "CHM 231", "CHM 235"],
            ["CHM 453", "CHM 460", "CHM 341", "CHM 361"],
            ["CHM 480", "CHM 495", "CHM 498", "CHM 482"],
        )),
        ("Other", (
            ["ASU 101", "ENG 101", "COM 100", "MAT 117"],
            ["COM 263", "ENG 102", "HST 110", "MAT 210"],
            ["MAT 265", "ENG 301", "SOC 300", "COM 310"],
            ["ENG 410", "SOC 400", "HST 495", "PHI 498"],
        )),
    )
}
# Per class year: a (lower, upper) pair of course counts.
course_ranges = dict(
    Freshman=(4, 8),
    Sophomore=(9, 15),
    Junior=(16, 24),
    Senior=(25, 30),
)
def apply_random_deviation(distribution, deviation=0.02):
    """Jitter every weight by a random amount and renormalize the result.

    Args:
        distribution: Mapping of major name to weight.
        deviation: Largest absolute amount that may be added to or
            subtracted from each weight (default 0.02).

    Returns:
        A new dict with the same keys in the same order. The values are
        non-negative and normalized to sum to 1. An empty input yields an
        empty dict. If every adjusted weight ends up at zero, all keys
        receive an equal share instead of raising ZeroDivisionError.
    """
    # Negative results are clamped to zero so no weight becomes negative.
    adjusted_distribution = {
        major: max(0, weight + random.uniform(-deviation, deviation))
        for major, weight in distribution.items()
    }
    total_weight = sum(adjusted_distribution.values())

    if total_weight == 0:
        # Nothing to normalize by (also covers the empty mapping, for which
        # the comprehension below simply produces an empty dict).
        return {
            major: 1 / len(adjusted_distribution)
            for major in adjusted_distribution
        }

    return {
        major: weight / total_weight
        for major, weight in adjusted_distribution.items()
    }
def generate_major(distribution):
    """Pick one major at random, weighted by the values of ``distribution``.

    Args:
        distribution: Mapping of major name to selection weight.

    Returns:
        One key of ``distribution``.
    """
    majors = list(distribution)
    weights = [distribution[name] for name in majors]
    # random.choices always returns a list; unpack its single element.
    (chosen,) = random.choices(majors, weights=weights, k=1)
    return chosen
def filter_courses_by_year(major_courses, year):
    """Keep only the courses whose number matches the student's class year.

    The course level is the first run of three digits found in the course
    string (e.g. "CSE 110" -> 110). Courses without such a number are dropped.

    Args:
        major_courses: Iterable of course strings.
        year: "Freshman", "Sophomore", "Junior" or "Senior". Any other value
            accepts every level from 100 through 499.

    Returns:
        A list of the matching courses in their original order.
    """
    level_bounds = {
        "Freshman": range(100, 200),
        "Sophomore": range(200, 300),
        "Junior": range(300, 400),
        "Senior": range(400, 500)
    }
    allowed_levels = level_bounds.get(year, range(100, 500))

    kept = []
    for code in major_courses:
        digits = re.search(r'(\d{3})', code)
        if digits is None:
            # No three-digit number, so the course has no level to compare.
            continue
        if int(digits.group(1)) in allowed_levels:
            kept.append(code)
    return kept
import re
import random
def filter_courses_by_level(major_courses, level_range):
    """Return the courses whose level lies in ``level_range``.

    The level is the first run of three digits in the course string; a course
    without one is treated as having level ``None``.

    Args:
        major_courses: Iterable of course strings.
        level_range: Container of accepted levels (e.g. ``range(100, 200)``).

    Returns:
        A list of the matching courses in their original order.
    """
    number_pattern = re.compile(r'(\d{3})')

    def in_requested_range(code):
        found = number_pattern.search(code)
        level = None if found is None else int(found.group(1))
        return level in level_range

    return list(filter(in_requested_range, major_courses))
def distribute_courses_across_levels(filtered_courses, year):
    """Sample up to three courses from every level band open to ``year``.

    A Freshman draws from the 100 band only; each later year additionally
    draws from the next band, up to the 400 band for a Senior.

    Args:
        filtered_courses: Iterable of course strings to draw from.
        year: "Freshman", "Sophomore", "Junior" or "Senior".

    Returns:
        A list of the sampled courses, grouped by band in ascending order.

    Raises:
        KeyError: If ``year`` is not one of the four known class years.
    """
    bands_by_year = {
        "Freshman": [(100, 200)],
        "Sophomore": [(100, 200), (200, 300)],
        "Junior": [(100, 200), (200, 300), (300, 400)],
        "Senior": [(100, 200), (200, 300), (300, 400), (400, 500)]
    }

    picked = []
    for low, high in bands_by_year[year]:
        candidates = filter_courses_by_level(filtered_courses, range(low, high))
        if not candidates:
            # Nothing offered in this band; move on to the next one.
            continue
        sample_size = min(3, len(candidates))
        picked += random.sample(candidates, sample_size)
    return picked
def generate_about_me():
    """Ask the OpenAI chat API for a short, name-free and pronoun-free bio.

    Returns:
        The text of the first returned choice, with surrounding whitespace
        stripped.
    """
    system_prompt = "You are a helpful assistant creating a short bio for a student."
    user_prompt = "Write a brief 1-3 line description of a student's hobbies and study method, avoiding any names or gender pronouns. Use a casual, friendly tone. Use a good amount of diversity for the activities. For the study method, use one of these: by using flashcards for quick recall, by creating detailed outlines of topics, with interactive quizzes and practice problems, by making digital notes with highlights, by summarizing topics into concise notes, with spaced repetition techniques, by explaining topics out loud to reinforce learning."

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=500,
        temperature=0.9,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()
def generate_profile(major_distribution):
    """Generate a random student profile with name-sex consistency.

    Args:
        major_distribution: Mapping of major name to selection weight, passed
            on to ``generate_major``.

    Returns:
        A dict with the keys "Name", "Year", "Major", "Sex", "Courses_Taken",
        "Course_Need_Help" and "About me". "Courses_Taken" never lists the
        same course twice.
    """
    # Assign the sex first so the first name can be drawn to match it.
    sex = random.choice(["Male", "Female"])
    if sex == "Male":
        first_name = fake.first_name_male()
    else:
        first_name = fake.first_name_female()
    last_name = fake.last_name()

    year = random.choice(list(course_ranges))  # Select the student's year
    major = generate_major(major_distribution)

    # Course-count bounds for the student's year
    min_courses, max_courses = course_ranges[year]

    # Get the courses for the selected major, or fall back to "Other" courses
    major_courses = courses_data.get(major, courses_data.get("Other", []))

    # Distribute courses across levels based on the student's year.
    # dict.fromkeys drops repeated entries while keeping first-seen order.
    courses_taken = list(dict.fromkeys(
        distribute_courses_across_levels(major_courses, year)
    ))

    # If the number of courses is insufficient, top up with fallback courses.
    # Each fallback course is added at most once: repeatedly extending the
    # list with the same four courses would only pad it with duplicates.
    fallback_courses = ["ASU 101", "ENG 102", "COM 263", "MAT 170"]
    if len(courses_taken) < min_courses:
        missing = [c for c in fallback_courses if c not in courses_taken]
        courses_taken.extend(missing)

    # The target count is capped by the number of distinct courses available,
    # so the sample below can never ask for more than the list holds.
    num_courses = min(random.randint(min_courses, max_courses), len(courses_taken))
    courses_taken = random.sample(courses_taken, num_courses)

    # Assign a help course (70% chance) based on the major and year;
    # otherwise, or when no pool exists, pick one of the courses taken.
    if random.random() < 0.7 and major in help_courses:
        help_class_options = help_courses[major].get(year, [])
        if help_class_options:
            course_need_help = random.choice(help_class_options)
        else:
            course_need_help = random.choice(courses_taken)
    else:
        course_need_help = random.choice(courses_taken)

    about_me = generate_about_me()

    # Create the profile dictionary
    profile = {
        "Name": f"{first_name} {last_name}",
        "Year": year,
        "Major": major,
        "Sex": sex,
        "Courses_Taken": courses_taken,
        "Course_Need_Help": course_need_help,
        "About me": about_me,
    }
    return profile
# Number of profiles to generate. Kept in one place so the loop and the final
# status message cannot drift apart.
NUM_PROFILES = 5000

# Apply random deviation to the major distribution
adjusted_distribution = apply_random_deviation(base_distribution)

# Generate the profiles with a progress bar
profiles = [
    generate_profile(adjusted_distribution)
    for _ in tqdm(range(NUM_PROFILES), desc="Generating Profiles", unit="profile")
]

# Save the profiles to a CSV file
df = pd.DataFrame(profiles)
df.to_csv('student_profiles.csv', index=False)
print(f"Generated {NUM_PROFILES} student profiles and saved to student_profiles.csv")