From 8fb1268780fced5cf772737d3eb071abf8fb5323 Mon Sep 17 00:00:00 2001
From: Anoushka Deshmukh <anoushka@vt.edu>
Date: Thu, 23 Feb 2023 13:23:31 +0000
Subject: [PATCH] script to translate raw data to sessions

---
 abstracting_script.py | 328 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 328 insertions(+)
 create mode 100644 abstracting_script.py

diff --git a/abstracting_script.py b/abstracting_script.py
new file mode 100644
index 0000000..562dd9b
--- /dev/null
+++ b/abstracting_script.py
@@ -0,0 +1,328 @@
+import os
+import pandas as pd
+import csv
+import re
+from datetime import datetime as dt, timedelta
+from collections import defaultdict
+import time
+
+# Module-level accumulators updated by abstract(), followed by the event-name groups used to classify raw events
+avg_session_time = 0  # NOTE(review): declared global in abstract() but never assigned anywhere in this file
+total_session_count = 0
+total_session_time = 0  # accumulated in seconds by abstract()
+
+document_event = ['document-ready']
+window_event = ['window-unload', 'window-blur', 'window-focus', ]
+pe_event = ['jsav-matrix-click', 'jsav-exercise-grade', 'jsav-exercise-reset', 'jsav-node-click',  # names reported as "PE event" by writeEvName and matched by check_pe_helper
+                'button-identifybutton', 'button-editbutton', 'button-addrowbutton', 'button-deletebutton', 'button-setterminalbutton', 'button-addchildbutton',
+                'button-checkbutton', 'button-autobutton', 'button-donebutton',
+                'submit-helpbutton', 'submit-edgeButton', 'submit-deleteButton', 'submit-undoButton', 'submit-redoButton', 'submit-editButton', 'submit-nodeButton',
+                'submit-begin', 'submit-finish', 'button-hintbutton', 'button-movebutton', 'button-removetreenodebutton', 'button-savefile', 'button-edgebutton', 
+                'jsav-exercise-model-end', 'jsav-exercise-model-begin', 'jsav-array-click',
+                'jsav-exercise-gradeable-step', 'jsav-exercise-grade',  # several names in this list are repeated (e.g. 'jsav-exercise-grade'); harmless for the `in` checks that use it
+                'jsav-exercise-model-open', 'jsav-exercise-model-forward',
+                'jsav-exercise-model-close', 'jsav-node-click',
+                'jsav-exercise-grade-change', 'jsav-exercise-reset',
+                'jsav-exercise-step-fixed', 'jsav-arraytree-click',
+                'jsav-exercise-undo', 'jsav-exercise-model-backward',
+                'jsav-exercise-model-begin', 'jsav-exercise-step-undone',
+                'jsav-exercise-model-end', 'odsa-award-credit', 'odsa-exercise-init', 
+                'button-classify', 'button-throwRoll', 'button-calculate',
+                'button-decrement', 'button-help', 'button-selecting',
+                'button-sorting', 'button-incrementing', 'button-run',
+                'button-partition', 'button-markSorted', 'button-reset',
+                'button-outputbuffer', 'button-noaction', 'button-submit',
+                'button-insert', 'button-remove', 'button-next', 'button-about',
+                'button-undir', 'button-dir', 'button-clear', 'button-read',
+                'button-write', 'button-restart']
+ff_event = ['jsav-begin', 'jsav-end', 'jsav-forward', 'jsav-backward']  # writeEvName tests this group before pe_event
+other_event = ['hyperlink', 'jsav-narration-on', 'jsav-narration-off', 'button-layoutRef', 'odsa-exercise-init']  # NOTE(review): not referenced elsewhere in this file; 'odsa-exercise-init' is also listed in pe_event
+
+# Load ./data/<file_name>/<file_name>_sorted.csv and publish it, sorted by user_id then action_time, as the module global `csvdata`
+def readfile(file_name):
+    os.chdir("./data/" + file_name)  # NOTE(review): changes the process working directory and never restores it, so later relative paths (including abstract()'s output file) resolve inside this folder, and a second call would look for ./data/... below it
+    print("Reading " + file_name + " data")
+    df = pd.read_csv(file_name + "_sorted.csv")
+    global csvdata
+    csvdata = df.sort_values(['user_id', 'action_time'])  # groups each user's rows together, ordered by the action_time column values
+    print(df.iloc[1])  # prints the second row of the unsorted frame as a sample; raises IndexError when the file has fewer than two data rows
+
+# Format a duration given in seconds as "D days H hours M minutes S seconds" (fractional seconds are truncated)
+def GetTime(seconds):
+    days, remainder = divmod(int(seconds), 86400)  # plain arithmetic: the old datetime(1, 1, 1) anchor read .day, which wrapped back to 0 once the duration reached 31 days
+    hours, remainder = divmod(remainder, 3600)
+    return "%d days %d hours %d minutes %d seconds" % (days, hours, *divmod(remainder, 60))
+
+# Map a raw row's 'name' value to its display category; groups are tried in the order listed and the first hit wins
+def writeEvName(row):
+    ev = row['name']
+    ordered_groups = (
+        (document_event, "document event"),
+        (window_event, "window event"),
+        (ff_event, "FF event"),
+        (pe_event, "PE event"),
+    )
+    for names, label in ordered_groups:
+        if ev in names:
+            return label
+    return "Other event"
+
+# Build the description text for an output row; short_name is read by column label so the result does not depend on CSV column order
+def writeDesc(row):    
+    # Rows whose exercise type is "pe" always get the fixed PE description
+    if row['ex_type'] == "pe":
+        return "Attempted to solve PE"
+    elif pd.isnull(row['inst_section_id']):
+        return row['description']  # no section attached: pass the raw description through unchanged
+    else:
+        # NOTE(review): a missing short_name arrives from pandas as NaN, which is truthy, so the "exercise" branch is probably never taken for CSV input - confirm the intended test
+        if row['short_name']:
+            return f"Attempted to solve {row['short_name']} frame "
+        else:
+            return f"Attempted to solve {row['short_name']} exercise"
+
+# Produce the duration text stored with an output row, or None when nothing should be reported
+def writeTime(row, start, end):
+    # PE-tagged rows and rows whose name is in ff_event report the bare duration, zero included
+    bare = row['description'] == "PE" or row['ex_type'] == "pe" or check_pe_helper(row['name']) or row['name'] in ff_event
+    if not bare and "document" in row['name']:
+        return None
+    elapsed = (end - start).total_seconds()
+    if bare:
+        return f'{elapsed} seconds'
+    # Remaining rows are labelled by whether short_name is truthy, and only when time actually passed
+    place = 'slideshow' if row['short_name'] else 'exercise'
+    return f'In {place} for {elapsed} seconds' if elapsed > 0 else None
+
+# Membership test against the pe_event name list.
+#   command: an event name (abstract() and bundle_pe pass a row's 'name' value)
+#   returns: True when the name is listed in pe_event, otherwise False
+def check_pe_helper(command):
+    listed = command in pe_event
+    return listed
+
+# Decide whether `curr` and `next` should be merged into one PE bundle (True) or kept as separate rows (False)
+def bundle_pe(curr, next):  # NOTE(review): the parameter name `next` shadows the builtin inside this function
+    if not pd.isnull(curr['ex_type']) or curr['description'] == 'PE': # curr has an ex_type, or is described as 'PE'
+        if not pd.isnull(next['ex_type']): # both curr and next have values
+            if curr['ex_type'] != 'pe' and next['ex_type'] != 'pe':  # rejects only when neither side is "pe", so a single "pe" side is enough to merge - NOTE(review): confirm `and` rather than `or` is intended
+                return False
+        else: # only curr has a value
+            if not check_pe_helper(next['name']) and next['description'] != 'PE':  # next must be a pe_event name or carry the description 'PE'; curr's own ex_type is not checked here
+                return False 
+    else: # curr has no ex_type and is not described as 'PE'
+        if check_pe_helper(curr['name']):
+            if next['ex_type'] == 'pe' or check_pe_helper(next['name']):
+                return True
+            else:
+                return False
+        else:
+            return False
+    return True  # reached only from the first branch, when neither rejection above fired
+ 
+# Two consecutive events share an FF bundle when curr has a non-null short_name and next carries the same one
+def bundle_ff(curr, next):
+    shared_name = curr['short_name']
+    if pd.isnull(shared_name):
+        return False
+    return bool(shared_name == next['short_name'])
+
+# Main entry point: load one dataset via readfile(), then collapse its raw per-event rows into bundled, session-numbered rows written to <file_name>_merged_result_unannotated.csv
+def abstract(file_name):
+    readfile(file_name)
+    print("Reading complete")
+    # The running totals live at module level so the caller can print them after the run
+    global avg_session_time
+    global total_session_count
+    global total_session_time
+
+    # Per-run state: session numbering, the open session's start, and the open bundle's start and size
+    session_count = 0 
+    session_start_time = csvdata.iloc[0]['action_time'] # Initially set up the start time as the first event's action time; raises IndexError when csvdata has no rows
+    session_end_time = 0
+    columns = ["session", "user ID", "Inst Book", "Event name", "Event Description", "Start time", "End Time", "Action Time", "Exercise Type", "Number of events"]  # NOTE(review): the rows below put the duration text under "Action Time" and short_name under "Exercise Type"
+    start_time = 0
+    num_event = 1
+
+    is_pe_exercise = False
+    is_ff_exercise = False
+
+    # Write the "<file_name>_merged_result_unannotated.csv" file (relative to the working directory set by readfile)
+    with open(file_name + '_merged_result_unannotated.csv', 'w', newline="") as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(columns)
+        session_count += 1  # the first user's first session is numbered 1; NOTE(review): it is not added to total_session_count
+
+        row_iterator = csvdata.iterrows()
+        _, curr = next(row_iterator)  # curr trails one row behind `row`, so each iteration sees a (current, next) pair
+        for i, row in row_iterator:  # `i` is not used
+            # Each iteration either extends the open bundle (and continues) or writes it out at a boundary
+            # Set the threshold for the session
+            threshold = 600  # seconds of inactivity that split a session; constant, yet re-assigned on every iteration
+
+        # Example of sample raw interaction data
+        # id	    user_id	    inst_book_id	name	        description	            action_time	        inst_chapter_module_id	inst_section_id	    inst_exercise_id	short_name	    ex_type
+        # 0         1           2               3               4                       5                   6                       7                   8                   9               10          
+        # 22961032	8387	    722	            document-ready	"User loaded module"	2020-01-21 18:43	70590				
+        # 23119732	3013	    722	            jsav-node-click	{"ev_num":43}	        2020-01-29 2:41	    70620	                109676	            1095	            sheet1exercise3	pe
+
+            user_id = curr['user_id']
+            book_id = curr['inst_book_id']
+            ev_name = curr['name'] 
+            ev_desc = curr['description']  # not read again in this function
+            action_time = curr['action_time']
+            module_id = curr['inst_chapter_module_id']  # not read again in this function
+            section_id = curr['inst_section_id']  # not read again in this function
+            exercise_id = curr['inst_exercise_id']  # not read again in this function
+            exercise_name = curr['short_name']
+            exercise_type = curr['ex_type']  # not read again in this function
+            next_ev = row
+
+            now = dt.strptime(action_time, "%Y-%m-%d %H:%M:%S")  # raises ValueError when the text has no seconds field; NOTE(review): the sample rows above show times without seconds - confirm the real CSV format
+            next_time = dt.strptime(next_ev['action_time'], "%Y-%m-%d %H:%M:%S")
+            time_diff = (next_time - now).total_seconds()  # gap to the next row in seconds; only used when both rows belong to the same user
+
+            end_time = start_time  # no observable effect: every write path below re-assigns end_time first
+
+            if start_time == 0:  # 0 is the sentinel for "no bundle open"
+                start_time = action_time
+
+            if session_count == 0:  # set to 0 by the user-change branch below: this is a new user's first event
+                # writer.writerow("\n")
+                # writer.writerow([f'Session {session_count + 1}'])
+                session_count += 1
+                session_start_time = action_time
+                total_session_count = total_session_count + 1
+
+            if user_id == next_ev['user_id']:
+                if (time_diff > threshold): # Creating a new session
+                    # For the last event of a session, we need to add one more row of event at the end of the session
+                    end_time = action_time
+                    start = dt.strptime(start_time, "%Y-%m-%d %H:%M:%S")
+                    end = dt.strptime(end_time, "%Y-%m-%d %H:%M:%S")
+                    diff = writeTime(curr, start, end)
+                    # diff = f'{(end - start).total_seconds()} seconds' if (end - start).total_seconds() > 0 else None 
+                    if is_pe_exercise:
+                        writer.writerow([session_count, user_id, book_id, writeEvName(curr), "Attempted to solve PE", start_time, end_time, diff, exercise_name, num_event])
+                    elif is_ff_exercise:  # writes exactly the same row as the else branch below
+                        writer.writerow([session_count, user_id, book_id, writeEvName(curr), writeDesc(curr), start_time, end_time, diff, exercise_name, num_event])
+                    else:
+                        writer.writerow([session_count, user_id, book_id, writeEvName(curr), writeDesc(curr), start_time, end_time, diff, exercise_name, num_event])
+
+                    # Write a comprehensive information about a session
+                    # Uncomment these lines if you want to see an annotated version of abstracted data
+                    session_end_time = end
+                    # writer.writerow([f'User inactive for {GetTime(time_diff)}'])
+                    # writer.writerow([f'Session lasted for {session_end_time - dt.strptime(session_start_time, "%Y-%m-%d %H:%M:%S")}'])
+                    # writer.writerow("\n")
+                    # writer.writerow([f'Session {session_count + 1}'])
+                    
+                    # Close the finished session: add its duration to the total and count the session that starts next
+                    total_session_time = total_session_time + (session_end_time - dt.strptime(session_start_time, "%Y-%m-%d %H:%M:%S")).total_seconds()
+                    total_session_count = total_session_count + 1
+
+                    session_count += 1  # numbering keeps increasing while the user stays the same
+                    session_start_time = next_ev['action_time']
+                    start_time = 0
+                    num_event = 1
+                    is_pe_exercise = False
+                    is_ff_exercise = False
+
+                else: # Retrieve all events in one session
+                    if user_id == next_ev['user_id']:  # always true here: the enclosing branch already checked the same condition
+                        if ev_name == next_ev['name']: # Finds duplicate events within the same session
+                            end_time = action_time
+                            num_event += 1
+                            curr = row
+                            continue  # bundle stays open; start_time is kept so the bundle spans all merged rows
+                        else:
+                            if bundle_pe(curr, next_ev):
+                                end_time = action_time
+                                num_event += 1
+                                is_pe_exercise = True
+                                curr = row
+                                continue
+                            elif bundle_ff(curr, next_ev):
+                                end_time = action_time
+                                num_event += 1
+                                is_ff_exercise = True
+                                curr = row
+                                continue
+                            else:
+                                end_time = action_time                        
+                                start = dt.strptime(start_time, "%Y-%m-%d %H:%M:%S")
+                                end = dt.strptime(end_time, "%Y-%m-%d %H:%M:%S")
+                                diff = writeTime(curr, start, end)
+                                # diff = f'In slideshow for {(end - start).total_seconds()} seconds' if (end - start).total_seconds() > 0 else None 
+                                # Bundle boundary inside a session: write the open bundle, then reset the bundle state
+                                if is_pe_exercise:
+                                    writer.writerow([session_count, user_id, book_id, writeEvName(curr), "Attempted to solve PE", start_time, end_time, diff, exercise_name, num_event])
+                                elif ev_name == 'window-focus':  # unlike the user-change branch below, the next event's name is not checked here
+                                    # if next_ev['name'] == 'window-blur':
+                                        # if (next_time - now).total_seconds() > 3:
+                                    writer.writerow([session_count, user_id, book_id, "Window open", writeDesc(curr), start_time, end_time, f'Reading time: {(next_time - now).total_seconds()} sec', exercise_name, num_event])
+
+                                elif ev_name == 'window-blur' and next_ev['name'] == 'window-focus':
+                                    # if (next_time - now).total_seconds() > 3:
+                                    writer.writerow([session_count, user_id, book_id, "Window close", writeDesc(curr), start_time, end_time, f'Away time: {(next_time - now).total_seconds()} sec', exercise_name, num_event])
+                                elif is_ff_exercise:  # writes exactly the same row as the else branch below
+                                    writer.writerow([session_count, user_id, book_id, writeEvName(curr), writeDesc(curr), start_time, end_time, diff, exercise_name, num_event])
+                                else:
+                                    writer.writerow([session_count, user_id, book_id, writeEvName(curr), writeDesc(curr), start_time, end_time, diff, exercise_name, num_event])
+                                start_time = 0
+                                num_event = 1
+                                is_pe_exercise = False
+                                is_ff_exercise = False
+                    else:  # unreachable, see the condition above
+                        start_time = 0
+                        session_count = 0
+                        num_event = 1
+
+            else: # Starts a new session for new student
+                # For the last event of a session, we need to add one more row of event at the end of the session
+                end_time = action_time                        
+                start = dt.strptime(start_time, "%Y-%m-%d %H:%M:%S")
+                end = dt.strptime(end_time, "%Y-%m-%d %H:%M:%S")
+                diff = writeTime(curr, start, end)
+                # diff = f'{(end - start).total_seconds()} seconds' if (end - start).total_seconds() > 0 else None 
+                if is_pe_exercise:
+                    writer.writerow([session_count, user_id, book_id, writeEvName(curr), "Attempted to solve PE", start_time, end_time, diff, exercise_name, num_event])
+                elif ev_name == 'window-focus' and next_ev['name'] == 'window-blur':  # NOTE(review): next_ev belongs to a different user in this branch, so this test and the time below span two users
+                    # if (next_time - now).total_seconds() > 3:
+                    writer.writerow([session_count, user_id, book_id, "Window open", writeDesc(curr), start_time, end_time, f'Reading time: {(next_time - now).total_seconds()} sec', exercise_name, num_event])
+                elif ev_name == 'window-blur' and next_ev['name'] == 'window-focus':
+                    # if (next_time - now).total_seconds() > 3:
+                    writer.writerow([session_count, user_id, book_id, "Window close", writeDesc(curr), start_time, end_time, f'Away time: {(next_time - now).total_seconds()} sec', exercise_name, num_event])
+                elif is_ff_exercise:  # writes exactly the same row as the else branch below
+                    writer.writerow([session_count, user_id, book_id, writeEvName(curr), writeDesc(curr), start_time, end_time, diff, exercise_name, num_event])
+                else:
+                    writer.writerow([session_count, user_id, book_id, writeEvName(curr), writeDesc(curr), start_time, end_time, diff, exercise_name, num_event])
+                
+                # Write a comprehensive information about a session
+                start_time = 0
+                session_count = 0  # the next iteration sees 0 and restarts numbering at 1 for the new user
+                num_event = 1
+
+                session_end_time = end
+                # writer.writerow([f'Session lasted for {session_end_time - dt.strptime(session_start_time, "%Y-%m-%d %H:%M:%S")}'])
+                # writer.writerow([f'All User Session Ended for User: {user_id}'])
+                total_session_time = total_session_time + (session_end_time - dt.strptime(session_start_time, "%Y-%m-%d %H:%M:%S")).total_seconds()
+
+                is_pe_exercise = False
+                is_ff_exercise = False
+            curr = row  # advance the pair; NOTE(review): when the loop ends, the bundle still open on the last row is never written and its session time is never added
+            
+                
+                
+start_time = time.time()  # wall-clock start, used only for the elapsed-time report on the last line
+
+abstract("cs5040_spring_2021")  # NOTE(review): runs whenever the module is executed or imported (no __main__ guard), with a hard-coded dataset name
+print(total_session_count, total_session_time)  # totals accumulated by abstract(): session count, then total session time in seconds
+print("--- %s seconds ---" % (time.time() - start_time))
+
+# SQL commands for pulling Fall 2020 CS4114 data
+# SELECT oui.id, oui.user_id, oui.inst_book_id, oui.name, oui.description, oui.action_time, oui.inst_chapter_module_id, exercise.inst_section_id, exercise.inst_exercise_id, ex.short_name, ex.ex_type
+# FROM opendsa.odsa_user_interactions oui
+# LEFT JOIN opendsa.inst_book_section_exercises exercise ON oui.inst_book_section_exercise_id = exercise.id
+# LEFT JOIN opendsa.inst_exercises ex ON exercise.inst_exercise_id = ex.id
+# WHERE oui.inst_book_id = 852 Order by action_time ASC;
-- 
GitLab