#!/usr/bin/env python3
import pandas as pd
import numpy as np
class Runtime:
def __init__(self, legacy=False):
self.schema = 'legacy' if legacy else 'current'
self.id_field = 'user_id' if legacy else 'aai_uid'
self.users = None
self.items = None
self.user_actions = None
self.user_actions_all = None
self.recommendations = None
self.categories = None
self.scientific_domains = None
self.provider = None
self.errors = []
# decorator that tags a function as a major metric and stores its description
def metric(txt):
def wrapper(f):
f.kind = "metric"
f.doc = txt
return f
return wrapper
# decorator that tags a function as a statistic and stores its description
def statistic(txt):
def wrapper(f):
f.kind = "statistic"
f.doc = txt
return f
return wrapper
# decorator that allows the run to continue
# after a fatal error in a statistic/metric calculation
def pass_on_error(func):
def wrapper(*args, **kwargs):
try:
result = func(*args, **kwargs)
except Exception as e:
print('Error occurred in: {}. "{}"'.format(func.__name__, str(e)))
            # find the Runtime argument (it holds the errors list) and
            # record the name of the function in which the exception occurred
_args = list(filter(lambda x: isinstance(x, Runtime), args))
if _args:
_args[0].errors.append(func.__name__)
return None
return result
return wrapper
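# Illustrative usage of the decorators above (hypothetical function;
# assumes a Runtime instance `rt` has been populated elsewhere):
#
#     @statistic("Example: the number of filtered user actions")
#     @pass_on_error
#     def example_stat(object):
#         return len(object.user_actions.index)
#
#     value = example_stat(rt)  # on exception: prints the error, appends
#                               # "example_stat" to rt.errors, returns None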
# Metrics
@statistic("The initial date on which metrics are calculated")
@pass_on_error
def start(object):
    """
    Calculate the start date on which metrics are calculated:
    the minimum timestamp found across the Pandas DataFrame objects
    user_actions and recommendations
    """
return str(
min(
min(object.user_actions["timestamp"]),
min(object.recommendations["timestamp"]),
)
)
@statistic("The final date on which metrics are calculated")
@pass_on_error
def end(object):
    """
    Calculate the end date on which metrics are calculated:
    the maximum timestamp found across the Pandas DataFrame objects
    user_actions and recommendations
    """
return str(
max(
max(object.user_actions["timestamp"]),
max(object.recommendations["timestamp"]),
)
)
@statistic("The total number of unique users in the system")
@pass_on_error
def users(object):
"""
Calculate the total number of unique users
found in Pandas DataFrame object users (if provided)
or user_actions otherwise
"""
return int(object.users["id"].nunique())
@statistic("The total number of unique registered users in the system")
@pass_on_error
def registered_users(object):
    """
    Calculate the total number of unique registered users,
    counted from the user_actions entries flagged as registered
    """
return object.user_actions[object.user_actions["registered"]][
object.id_field].nunique()
@statistic("The total number of unique anonymous users in the system")
@pass_on_error
def anonymous_users(object):
    """
    Calculate the total number of unique anonymous users:
    the total number of users minus the registered ones
    """
    return users(object) - registered_users(object)
@statistic("The number of unique published items in the evaluated RS")
@pass_on_error
def items(object):
"""
Calculate the number of unique items
found in Pandas DataFrame object items (if provided)
or user_actions otherwise (from both Source and Target item)
"""
return int(object.items["id"].nunique())
@statistic("The number of recommended items in the evaluated RS")
@pass_on_error
def recommended_items(object):
"""
Calculate the number of recommended items
found in Pandas DataFrame object recommendations
"""
return len(object.recommendations.index)
@statistic("The total number of user actions")
@pass_on_error
def user_actions_all(object):
"""
Calculate the total number of user_actions
    found in the Pandas DataFrame object user_actions_all
"""
return len(object.user_actions_all.index)
@statistic("The number of filtered user actions")
@pass_on_error
def user_actions(object):
"""
Calculate the number of filtered user_actions
found in Pandas DataFrame object user_actions
"""
return len(object.user_actions.index)
@statistic("The number of filtered user actions performed by registered users")
@pass_on_error
def user_actions_registered(object):
"""
    Calculate the number of filtered user_actions performed by registered users
found in Pandas DataFrame object user_actions
"""
return len(object.user_actions[object.user_actions["registered"]].index)
@statistic("The number of filtered user actions performed by anonymous users")
@pass_on_error
def user_actions_anonymous(object):
"""
    Calculate the number of filtered user_actions performed by anonymous users
found in Pandas DataFrame object user_actions
"""
return user_actions(object) - user_actions_registered(object)
@statistic(
    "The percentage (%) of filtered user actions performed by registered "
    "users to the total user actions"
)
@pass_on_error
def user_actions_registered_perc(object):
"""
    Calculate the percentage (%) of filtered user actions performed
    by registered users to the total user actions
found in Pandas DataFrame object user_actions (in two decimals)
"""
return round((user_actions_registered(object) * 100.0
/ user_actions(object)), 2)
@statistic(
    "The percentage (%) of filtered user actions performed by anonymous "
    "users to the total user actions"
)
@pass_on_error
def user_actions_anonymous_perc(object):
"""
    Calculate the percentage (%) of filtered user actions performed
    by anonymous users to the total user actions
found in Pandas DataFrame object user_actions (in two decimals)
"""
return round(100.0 - user_actions_registered_perc(object), 2)
@statistic("The total number of item views by the users")
@pass_on_error
def item_views_all(object):
    """
    Calculate the total number of user_actions that led to item views,
    found in the Pandas DataFrame object user_actions_all
    """
    # drop actions whose target path is the search page
    # drop actions where the source and target pages resolve to the
    # same service, since those are navigation within the same service
    # (items other than services have no such in-service navigation)
    _df = object.user_actions_all[
        (object.user_actions_all["target_resource_id"] != -1)
        & (object.user_actions_all["target_resource_id"] != '-1')
        & (object.user_actions_all["target_resource_id"].notna())
    ].copy()
if object.schema == 'legacy':
pattern = r"/services/([^/]+)/"
_df = _df[_df["target_path"].str.match(pattern) &
~_df["target_path"].str.startswith("/services/c/")]
else:
pattern = r"search%2F(?:all|dataset|software|service" + \
r"|data-source|training|guideline|other)"
_df = _df[~_df["target_path"].str.match(pattern)]
_df['source'] = _df['source_path'].str.extract(r"/services/(.*?)/")
_df['target'] = _df['target_path'].str.extract(r"/services/(.*?)/")
_df = _df[_df['source'] != _df['target']]
return len(_df.index)
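# Illustrative effect of the filtering above (hypothetical paths):
# in the legacy schema, a target path "/services/egi-notebooks/" is kept
# (it matches the item-page pattern), "/services/c/compute" is dropped
# (category listing), and a source "/services/x/offers" with target
# "/services/x/details" is dropped, since both resolve to service "x".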
@statistic("The number of filtered item views by the users")
@pass_on_error
def item_views(object):
    """
    Calculate the number of filtered user_actions that led to item
    views, found in the Pandas DataFrame object user_actions
    """
    # drop actions whose target path is the search page
    # drop actions where the source and target pages resolve to the
    # same service, since those are navigation within the same service
    # (items other than services have no such in-service navigation)
    _df = object.user_actions[
        (object.user_actions["target_resource_id"] != -1)
        & (object.user_actions["target_resource_id"] != '-1')
        & (object.user_actions["target_resource_id"].notna())
    ].copy()
if object.schema == 'legacy':
pattern = r"/services/([^/]+)/"
_df = _df[_df["target_path"].str.match(pattern) &
~_df["target_path"].str.startswith("/services/c/")]
else:
pattern = r"search%2F(?:all|dataset|software|service" + \
r"|data-source|training|guideline|other)"
_df = _df[~_df["target_path"].str.match(pattern)]
_df['source'] = _df['source_path'].str.extract(r"/services/(.*?)/")
_df['target'] = _df['target_path'].str.extract(r"/services/(.*?)/")
_df = _df[_df['source'] != _df['target']]
return len(_df.index)
@statistic("The number of item views by the registered users")
@pass_on_error
def item_views_registered(object):
    """
    Calculate the number of filtered user_actions by registered users
    that led to item views, found in the Pandas DataFrame object
    user_actions
    """
    # drop actions whose target path is the search page
    # drop actions where the source and target pages resolve to the
    # same service, since those are navigation within the same service
    # (items other than services have no such in-service navigation)
    _df = object.user_actions[
        (object.user_actions["target_resource_id"] != -1)
        & (object.user_actions["target_resource_id"] != '-1')
        & (object.user_actions["target_resource_id"].notna())
    ].copy()
if object.schema == 'legacy':
pattern = r"/services/([^/]+)/"
_df = _df[_df["target_path"].str.match(pattern) &
~_df["target_path"].str.startswith("/services/c/")]
else:
pattern = r"search%2F(?:all|dataset|software|service" + \
r"|data-source|training|guideline|other)"
_df = _df[~_df["target_path"].str.match(pattern)]
_df['source'] = _df['source_path'].str.extract(r"/services/(.*?)/")
_df['target'] = _df['target_path'].str.extract(r"/services/(.*?)/")
_df = _df[_df['source'] != _df['target']]
_df = _df[_df['registered']]
return len(_df.index)
@statistic("The number of item views by the anonymous users")
@pass_on_error
def item_views_anonymous(object):
    """
    Calculate the number of filtered user_actions by anonymous users
    that led to item views, found in the Pandas DataFrame object
    user_actions
    """
return item_views(object) - item_views_registered(object)
@statistic(
"The percentage (%) of user_actions led by registered users to item views"
)
@pass_on_error
def item_views_registered_perc(object):
"""
Calculate the percentage (%) of user_actions led by registered users to
item views found in Pandas DataFrame object user_actions (in two decimals)
"""
try:
return round((item_views_registered(object) * 100.0 /
item_views(object)), 2)
except ZeroDivisionError:
return 0
@statistic(
"The percentage (%) of user_actions led by anonymous users to item views"
)
@pass_on_error
def item_views_anonymous_perc(object):
"""
Calculate the percentage (%) of user_actions led by anonymous users to
item views found in Pandas DataFrame object user_actions (in two decimals)
"""
return round(100.0 - item_views_registered_perc(object), 2)
@statistic("The total number of unique recommended items")
@pass_on_error
def total_unique_recommended_items(object):
"""
Calculate the total number of unique items found in recommendations
"""
    return int(object.recommendations["resource_id"].nunique())
@statistic("The total number of unique users found in recommendations")
@pass_on_error
def total_unique_users_recommended(object):
"""
Calculate the total number of unique users found in recommendations
"""
    return int(object.recommendations[object.id_field].nunique())
@statistic("A dictionary of the number of user actions per day")
@pass_on_error
def user_actions_per_day(object):
    """
    It returns a statistical report as a list of records, one for each
    day found in the period, each with two fields: date and value (the
    number of user_actions committed that day). The report includes all
    in-between days (obviously, with the count set to zero).
    User_actions are already filtered to those where both the user and
    the item exist in the users' and items' catalogs.
    """
    # since user_actions is in use, entries whose user or item is
    # missing from the users' or items' catalogs have already
    # been removed
    # count user_actions for each day found in entries
res = (
object.user_actions.groupby(by=object.user_actions["timestamp"]
.dt.date)
.count()
.iloc[:, 0]
)
# create a Series with period's start and end times and value of 0
init = pd.Series(
[0, 0],
index=[
pd.to_datetime(start(object)).date(),
pd.to_datetime(end(object)).date(),
],
)
# remove duplicate entries for corner cases where start and end time match
init.drop_duplicates(keep="first", inplace=True)
    # concatenate the two endpoint entries (value 0) with the Series;
    # with axis=1, matching indexes are merged
    # the concatenation yields a DataFrame, so keep the first column
    res = pd.concat([res, init], ignore_index=False, axis=1).iloc[:, 0]
    # convert NaN values created by the concatenation to 0
    # and change the data type back to int
    res = res.fillna(0).astype(int)
    # fill the in-between days with zero user_actions
    res = res.asfreq("D", fill_value=0)
# convert datetimeindex to string
res.index = res.index.format()
# convert series to dataframe with extra column having the dates
res = res.to_frame().reset_index()
# rename columns to date, value
res.rename(columns={res.columns[0]: "date", res.columns[1]: "value"},
inplace=True)
# return a list of objects with date and value fields
return res.to_dict(orient="records")
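# Illustrative shape of the returned report (hypothetical dates/counts):
#     [{"date": "2024-03-01", "value": 12},
#      {"date": "2024-03-02", "value": 0},
#      {"date": "2024-03-03", "value": 7}]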
@statistic("A dictionary of the number of user actions per month")
@pass_on_error
def user_actions_per_month(object):
    """
    It returns a statistical report as a list of records, one for each
    month found in the period, each with two fields: date and value
    (the number of user_actions committed that month). The report
    includes all in-between months (obviously, with the count set to
    zero). User_actions are already filtered to those where both the
    user and the item exist in the users' and items' catalogs.
    """
    # since user_actions is in use, entries whose user or item is
    # missing from the users' or items' catalogs have already
    # been removed
    # count user_actions for each day found in entries
res = (
object.user_actions.groupby(by=object.user_actions["timestamp"]
.dt.date)
.count()
.iloc[:, 0]
)
# create a Series with period's start and end times and value of 0
init = pd.Series(
[0, 0],
index=[
pd.to_datetime(start(object)).date(),
pd.to_datetime(end(object)).date(),
],
)
# remove duplicate entries for corner cases where start and end time match
init.drop_duplicates(keep="first", inplace=True)
    # concatenate the two endpoint entries (value 0) with the Series;
    # with axis=1, matching indexes are merged
    # the concatenation yields a DataFrame, so keep the first column
    res = pd.concat([res, init], ignore_index=False, axis=1).iloc[:, 0]
    # convert NaN values created by the concatenation to 0
    # and change the data type back to int
    res = res.fillna(0).astype(int)
    # fill the in-between days with zero user_actions
    res = res.asfreq("D", fill_value=0)
    # resample results to monthly granularity
    res = res.resample('M').sum()
# convert datetimeindex to string
res.index = res.index.format()
# convert series to dataframe with extra column having the dates
res = res.to_frame().reset_index()
# rename columns to date, value
res.rename(columns={res.columns[0]: "date", res.columns[1]: "value"},
inplace=True)
# keep YYYY-MM format in date field
res['date'] = res['date'].str[:-3]
# return a list of objects with date and value fields
return res.to_dict(orient="records")
@statistic("A dictionary of the number of recommended items per day")
@pass_on_error
def recommended_items_per_day(object):
"""
    It returns a timeseries of recommended item counts per day.
Each timeseries item has two fields: date and value
"""
# count recommendations for each day found in entries
res = (
object.recommendations.groupby(by=object
.recommendations["timestamp"].dt.date)
.count()
.iloc[:, 0]
)
# create a Series with period's start and end times and value of 0
init = pd.Series(
[0, 0],
index=[
pd.to_datetime(start(object)).date(),
pd.to_datetime(end(object)).date(),
],
)
# remove duplicate entries for corner cases where start and end time match
init.drop_duplicates(keep="first", inplace=True)
    # concatenate the two endpoint entries (value 0) with the Series;
    # with axis=1, matching indexes are merged
    # the concatenation yields a DataFrame, so keep the first column
    res = pd.concat([res, init], ignore_index=False, axis=1).iloc[:, 0]
    # convert NaN values created by the concatenation to 0
    # and change the data type back to int
    res = res.fillna(0).astype(int)
    # fill the in-between days with zero recommendations
    res = res.asfreq("D", fill_value=0)
# convert datetimeindex to string
res.index = res.index.format()
# convert series to dataframe with extra column having the dates
res = res.to_frame().reset_index()
# rename columns to date, value
res.rename(columns={res.columns[0]: "date", res.columns[1]: "value"},
inplace=True)
# return a list of objects with date and value fields
return res.to_dict(orient="records")
@statistic("A dictionary of the number of recommended items per month")
@pass_on_error
def recommended_items_per_month(object):
"""
    It returns a timeseries of recommended item counts per month.
Each timeseries item has two fields: date and value
"""
# count recommendations for each day found in entries
res = (
object.recommendations.groupby(by=object
.recommendations["timestamp"].dt.date)
.count()
.iloc[:, 0]
)
# create a Series with period's start and end times and value of 0
init = pd.Series(
[0, 0],
index=[
pd.to_datetime(start(object)).date(),
pd.to_datetime(end(object)).date(),
],
)
# remove duplicate entries for corner cases where start and end time match
init.drop_duplicates(keep="first", inplace=True)
    # concatenate the two endpoint entries (value 0) with the Series;
    # with axis=1, matching indexes are merged
    # the concatenation yields a DataFrame, so keep the first column
    res = pd.concat([res, init], ignore_index=False, axis=1).iloc[:, 0]
    # convert NaN values created by the concatenation to 0
    # and change the data type back to int
    res = res.fillna(0).astype(int)
    # fill the in-between days with zero recommendations
    res = res.asfreq("D", fill_value=0)
    # resample results to monthly granularity
    res = res.resample('M').sum()
# convert datetimeindex to string
res.index = res.index.format()
# convert series to dataframe with extra column having the dates
res = res.to_frame().reset_index()
# rename columns to date, value
res.rename(columns={res.columns[0]: "date", res.columns[1]: "value"},
inplace=True)
# keep YYYY-MM format in date field
res['date'] = res['date'].str[:-3]
# return a list of objects with date and value fields
return res.to_dict(orient="records")
@metric("The percentage (%) of unique recommended items to the total "
        "number of published items")
@pass_on_error
def catalog_coverage(object):
    """
    Calculate the percentage (%) of unique recommended items
    to the total number of published items
    """
return round((total_unique_recommended_items(object) * 100.0 /
items(object)), 2)
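# Worked example (illustrative): if 40 items are published and 10 unique
# items appear in recommendations, catalog coverage is
# 10 * 100.0 / 40 = 25.0 (%).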
@metric("The percentage (%) of unique users that received "
        "recommendations to the total number of users")
@pass_on_error
def user_coverage(object):
    """
    Calculate the percentage (%) of unique users that received
    recommendations to the total number of users
    """
return round((total_unique_users_recommended(object) * 100.0 /
users(object)), 2)
@metric(
"The ratio of user hits divided by the total number of users "
"(user hit: a user that has accessed at least one item "
"that is also a personal recommendation)"
)
@pass_on_error
def hit_rate(object):
    """
    1) For each user, get the recommended items and the items the user
       accessed
    2) Count how many of the user's accessed items also appear in the
       user's recommendations (the user's hits)
    3) Sum the hits over all users
    4) Divide by the total number of users
    """
    # object.users already contains only the registered users
# a matrix of User ids and the respective accessed items' ids
access_df = object.users[["id", "accessed_resources"]]
# a matrix of User ids and the respective recommended items' ids
rec_df = (
object.recommendations[[object.id_field, "resource_id"]]
.groupby([object.id_field])
.agg({"resource_id": lambda x: x.unique().tolist()})
.reset_index()
)
    # performs an inner join on User id, keeping only the users
    # for whom recommendations were actually made
    data = pd.merge(access_df, rec_df, left_on="id", right_on=object.id_field,
                    how="inner")
    # calculate hits per user
    # performs an intersection of accessed and recommended items per user
    data['intersect'] = data.apply(lambda row: list(set(
        row['accessed_resources']).intersection(row['resource_id'])), axis=1)
# hits = the length of the intersection
data['intersect_len'] = data['intersect'].apply(len)
    # sum the hits across all users
total_hits = data['intersect_len'].sum()
return round(total_hits/len(object.users), 5)
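# Worked example (illustrative): with two registered users, where user A
# accessed {s1, s2} and was recommended {s2, s3} (1 hit) and user B
# accessed {s4} and was recommended {s3} (0 hits), the result is
# (1 + 0) / 2 = 0.5. Note that the sum counts every matched item, so a
# user with several matches contributes more than one hit.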
@metric(
"The number of user clicks through recommendations panels divided by the "
"total times recommendation panels were presented to users. "
"Takes into account all historical data of user actions"
)
@pass_on_error
def click_through_rate(object):
    """
    Get only the user actions that present a recommendation panel to the
    user in the source page.
    In the legacy schema, those are actions with the following source
    paths (in the current schema, pages under search%2F):
    - /services
    - /services/
    - /services/c/{any category name}
    1) Count the items in the above list, as they represent the times
       recommendation panels were presented to the users of the portal
    2) Narrow the above list into a new subset by selecting only user
       actions that originate from a recommendation panel
    3) Those are actions whose 'panel' field equals 'recommendation_panel'
    4) Count the items in the subset, as they represent the times users
       clicked through recommendations
    5) Divide the items of the subset by the items of the first list to
       get the click-through rate
    """
# get user actions
if object.schema == 'legacy':
user_actions_recpanel_views = object.user_actions[
object.user_actions['source_path'].isin(
['/services', '/services/']
) |
object.user_actions['source_path'].str.startswith('/services/c/')
]
else:
user_actions_recpanel_views = object.user_actions[
object.user_actions['source_path'].str.startswith('search%2F')
]
user_actions_recpanel_clicks = user_actions_recpanel_views[
user_actions_recpanel_views['panel'] == 'recommendation_panel'
]
try:
return round(
len(user_actions_recpanel_clicks)
/ len(user_actions_recpanel_views), 2
)
except ZeroDivisionError:
return 0.00
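# Worked example (illustrative): if 100 user actions started from pages
# carrying a recommendation panel and 25 of them originated from the
# panel itself, the click-through rate is 25 / 100 = 0.25.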
@metric(
"The diversity of the recommendations according to Shannon Entropy. "
"The entropy is 0 when a single item is always chosen or recommended, "
"and log n when n items are chosen or recommended equally often."
)
@pass_on_error
def diversity(object, anonymous=False):
"""
Calculate Shannon Entropy. The entropy is 0 when a single item is always
chosen or recommended, and log n when n items are chosen or recommended
equally often.
"""
# keep recommendations with or without anonymous suggestions
# based on anonymous flag (default=False, i.e. ignore anonymous)
if anonymous:
recs = object.recommendations
else:
recs = object.recommendations[
(object.recommendations[object.id_field]
.find_registered(object.schema))
]
    # recommendations have been previously filtered based on the
    # existence of users and items
    # item_count:
    # group recommendation entries by item id and
    # then count how many times each item has been suggested
    gr_item = recs.groupby(["resource_id"]).count()
    # create a dictionary of item_count in order to
    # map the item id to the respective item_count
    # key=<item id> and value=<item_count>
    d_item = gr_item[object.id_field].to_dict()
    # each element represents an item's recommendation occurrence count,
    # e.g. [1, 6, 7]: one item was recommended once, another 6 times
    # and another 7 times
    # (named counts so as not to shadow the module-level items() function)
    counts = np.array(list(d_item.values()))
    # the total number of recommendations
    n_recommendations = counts.sum()
    # element-wise computation of each item's share of recommendations
    recommended_probability = counts / n_recommendations
# H=-Sum(p*logp) [element-wise]
shannon_entropy = -np.sum(
recommended_probability * np.log2(recommended_probability)
)
return round(shannon_entropy, 4)
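# Worked example (illustrative): for recommendation counts [1, 6, 7],
# n_recommendations = 14 and p = [1/14, 6/14, 7/14], so
# H = -(1/14*log2(1/14) + 6/14*log2(6/14) + 7/14*log2(7/14)) ~= 1.2958.
# If all three items were recommended equally often, H = log2(3) ~= 1.585
# (the maximum for three items); a single always-recommended item gives 0.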
@metric(
"The diversity of the recommendations according to GiniIndex. "
"The index is 0 when all items are "
"chosen equally often, and 1 when a single item is always chosen."
)
@pass_on_error
def diversity_gini(object, anonymous=False):
"""
Calculate GiniIndex based on
https://elliot.readthedocs.io/en/latest/_modules/elliot/evaluation
/metrics/diversity/gini_index/gini_index.html#GiniIndex
(see book https://link.springer.com/10.1007/978-1-4939-7131-2_110158)
"""
# keep recommendations with or without anonymous suggestions
# based on anonymous flag (default=False, i.e. ignore anonymous)
if anonymous:
recs = object.recommendations
else:
recs = object.recommendations[
(object.recommendations[object.id_field]
.find_registered(object.schema))
]
    # this variable keeps the sum of user_norm (where user_norm is
    # the count of recommendations made to each user)
    # however, since no per-user cutoff is applied to recommendations and
    # each recommendation entry is a one-to-one <user id> <item id> pair,
    # the total number of recommendations is equal to this sum
free_norm = len(recs.index)
# item_count
# group recommendations entries by item id and
# then count how many times each item has been suggested
gr_item = recs.groupby(["resource_id"]).count()
# create a dictionary of item_count in order to
# map the item id to the respective item_count
# key=<item id> and value=<item_count>
d_item = gr_item[object.id_field].to_dict()
    # total number of recommended items
    n_recommended_items = len(d_item)
    # total number of items
    num_items = items(object)
    # create a zero list
    # to calculate the gini index including elements with 0 occurrence
    zeros = [0] * (num_items - n_recommended_items)
gini = sum(
[
(2 * (j + 1) - num_items - 1) * (cs / free_norm)
for j, cs in enumerate(zeros + sorted(d_item.values()))
]
)
gini /= num_items - 1
return round(gini, 4)
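# Worked example (illustrative): with num_items = 4 in the catalog and
# recommendation counts {A: 1, B: 6, C: 7} (free_norm = 14), the padded,
# sorted counts are [0, 1, 6, 7] and the weights 2*(j+1)-num_items-1 are
# [-3, -1, 1, 3], so
# gini = (-3*0/14 - 1*1/14 + 1*6/14 + 3*7/14) / (4 - 1) ~= 0.6190.
# Equal counts yield 0; a single always-recommended item yields 1.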
@metric("The novelty expresses how often new and unseen items are"
" recommended to users")
@pass_on_error
def novelty(object):
"""Calculate novelty of recommendations
using the n=SUM(-log(p(i)))/|R| formula"""
# published items
items_pub = object.items["id"]
# recommended items to authenticated users
items_rec = (object
.recommendations[object.recommendations[object.id_field]
.find_registered(
object.schema)]["resource_id"])
# items that are published and recommended
items_recpub = items_rec[items_rec
.isin(items_pub)].drop_duplicates()
# user actions
ua = object.user_actions
    # filter out user actions where source and target are the same,
    # where the target equals -1, and those made by anonymous users
    ua_serv_view = ua[
        (ua["source_resource_id"] != ua["target_resource_id"])
        & (ua["target_resource_id"] != -1)
        & (ua["target_resource_id"] != '-1')
        & (ua["target_resource_id"].notna())
        & (ua[object.id_field].find_registered(object.schema))
    ]
# count item views by item id (sorted by item id)
items_viewed = (ua_serv_view["target_resource_id"]
.value_counts().sort_index())
# create a table for each recommended item with columns
# for number of views, p(i) and -log(pi)
r_items = pd.DataFrame(index=items_recpub).sort_index()
# add views column to assign views to each recommended item
r_items["views"] = items_viewed
# count the total item views in order to compute the portions p(i)
total_views = r_items["views"].sum()
# count the total recommended items |R|
total_items = len(r_items)
    # compute the p(i) of each recommended item
r_items["pi"] = r_items["views"] / total_views
# calculate the negative log of the p(i).
r_items["-logpi"] = -np.log2(r_items["pi"])
# calculate novelty based on formula n=SUM(-log(p(i)))/|R|
novelty = r_items["-logpi"].sum() / total_items
return round(novelty, 4)
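# Worked example (illustrative): with two recommended items X and Y,
# where X was viewed 3 times and Y once (total_views = 4),
# p(X) = 0.75 and p(Y) = 0.25, so
# novelty = (-log2(0.75) - log2(0.25)) / 2 = (0.415 + 2.0) / 2 ~= 1.2075.
# Rarely viewed (more novel) recommendations push the score up.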
@metric(
"The mean value of the accuracy score found for each user defined by the "
"fraction of the number of the correct predictions by the total number "
"of predictions"
)
@pass_on_error
def accuracy(object):
    """
    Calculate the accuracy score for each user and return the mean
    value. The score is calculated by dividing the number of correct
    predictions by the total number of predictions.
    """
# a list of unique items' ids found in Datastore
items_list = object.items["id"].unique().tolist()
    # the number of unique items (the length of the list above)
len_items = items(object)
def score(x):
"""
Inner function called at each row of the final dataframe
in order to calculate the accuracy score for each row (=user)
"""
        # 'accessed_resources' holds the accessed items' list,
        # while 'resource_id' holds the recommended items' list
        # if the accessed or recommended items' list is empty,
        # no further computation is needed for it
        # otherwise, for each item found in items_list,
        # put 1 or 0 depending on whether it is also found in the
        # accessed or recommended items respectively
if not x["accessed_resources"]:
true_values = np.array([0] * len_items)
else:
true_values = np.array(
list(map(lambda s: 1 if s in x["accessed_resources"] else 0,
items_list))
)
if not x["resource_id"]:
pred_values = np.array([0] * len_items)
else:
pred_values = np.array(
list(map(lambda s: 1 if s in x["resource_id"] else 0,
items_list))
)
        # Calculate the accuracy score by computing the average of the
        # returned array.
        # The comparison yields a True/False array indicating, element
        # by element, whether true_values equals pred_values
        x["resource_id"] = np.average(true_values == pred_values)
        # return the row, where the 'resource_id' column now holds
        # the accuracy score
        return x
# a matrix of User ids and the respective accessed items' ids
access_df = object.users[["id", "accessed_resources"]]
# a matrix of User ids and the respective recommended items' ids
rec_df = (
object.recommendations[[object.id_field, "resource_id"]]
.groupby([object.id_field])
.agg({"resource_id": lambda x: x.unique().tolist()})
.reset_index()
)
# performs a left join on User id, which means that nan values
# are set for cases where no recommendations were made
data = pd.merge(access_df, rec_df, left_on="id", right_on=object.id_field,
how="left")
# convert nan values to zeros, in order to be handled easily
# by the inner function
data.fillna(0, inplace=True)
# apply the score function row-wise
data = data.apply(score, axis=1)
# return the mean value of all users' accuracy score
# up to 4 digits precision
return round(data["resource_id"].mean(), 4)
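# Worked example (illustrative): with items_list = [a, b, c], a user who
# accessed [a] and was recommended [a, c] yields true_values = [1, 0, 0]
# and pred_values = [1, 0, 1]; the arrays agree on 2 of 3 positions, so
# that user's score is 2/3 ~= 0.6667, and the metric reports the mean
# over all users.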