From 1dd3bd0b35b6e1b01719c3f4726a3eaf06ee8875 Mon Sep 17 00:00:00 2001 From: Samuele Kaplun Date: Mon, 28 May 2018 22:41:16 +0200 Subject: [PATCH] Custom dimensions * Integrate Custom dimensions, as a replacement for Custom Variables. (Closes #188) Signed-off-by: Samuele Kaplun --- import_logs.py | 150 ++++++++++++++++++++++++++++++++++++--------- tests/test_main.py | 49 ++++++++++----- 2 files changed, 153 insertions(+), 46 deletions(-) diff --git a/import_logs.py b/import_logs.py index 7c73c5c..a5b1776 100755 --- a/import_logs.py +++ b/import_logs.py @@ -27,6 +27,7 @@ import codecs import datetime import fnmatch +import functools import gzip import hashlib import http.client @@ -816,10 +817,10 @@ def _create_parser(self): parser.add_argument( '--w3c-field-regex', action=StoreDictKeyPair, metavar='KEY=VAL', default={}, dest="w3c_field_regexes", type=str, help="Specify a regex for a field in your W3C extended log file. You can use this option to parse fields the " - "importer does not natively recognize and then use one of the --regex-group-to-XXX-cvar options to track " - "the field in a custom variable. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) " - "--regex-group-to-page-cvar=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field " - "in the 'Windows Status Code' custom variable. Regexes must contain a named group." + "importer does not natively recognize and then use one of the --regex-group-to-XXX-cdim options to track " + "the field in a custom dimension. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) " + "--regex-group-to-action-cdim=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field " + "in the 'Windows Status Code' custom dimension. Regexes must contain a named group." ) parser.add_argument( '--title-category-delimiter', dest='title_category_delimiter', default='/', @@ -840,22 +841,29 @@ def _create_parser(self): "disable normal user id tracking. 
See documentation for --log-format-regex for list of available " "regex groups." ) - parser.add_argument( - '--regex-group-to-visit-cvar', action=StoreDictKeyPair, metavar='KEY=VAL',dest='regex_group_to_visit_cvars_map', default={}, - help="Track an attribute through a custom variable with visit scope instead of through Matomo's normal " - "approach. For example, to track usernames as a custom variable instead of through the uid tracking " - "parameter, supply --regex-group-to-visit-cvar=\"userid=User Name\". This will track usernames in a " - "custom variable named 'User Name'. The list of available regex groups can be found in the documentation " + '--regex-group-to-visit-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_visit_cvars_map', default={}, + help="DEPRECATED" + ) + parser.add_argument( + '--regex-group-to-page-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_page_cvars_map', default={}, + help="DEPRECATED" + ) + parser.add_argument( + '--regex-group-to-visit-cdim', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_visit_cdims_map', default={}, + help="Track an attribute through a custom dimension with visit scope instead of through Matomo's normal " + "approach. For example, to track usernames as a custom dimension instead of through the uid tracking " + "parameter, supply --regex-group-to-visit-cdim=\"userid=User Name\". This will track usernames in a " + "custom dimension named 'User Name'. The list of available regex groups can be found in the documentation " "for --log-format-regex (additional regex groups you may have defined " "in --log-format-regex can also be used)." ) parser.add_argument( - '--regex-group-to-page-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_page_cvars_map', default={}, - help="Track an attribute through a custom variable with page scope instead of through Matomo's normal " - "approach. 
For example, to track usernames as a custom variable instead of through the uid tracking " - "parameter, supply --regex-group-to-page-cvar=\"userid=User Name\". This will track usernames in a " - "custom variable named 'User Name'. The list of available regex groups can be found in the documentation " + '--regex-group-to-action-cdim', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_action_cdims_map', default={}, + help="Track an attribute through a custom dimension with action scope instead of through Matomo's normal " + "approach. For example, to track usernames as a custom dimension instead of through the uid tracking " + "parameter, supply --regex-group-to-action-cdim=\"userid=User Name\". This will track usernames in a " + "custom dimension named 'User Name'. The list of available regex groups can be found in the documentation " "for --log-format-regex (additional regex groups you may have defined " "in --log-format-regex can also be used)." ) @@ -1782,6 +1790,49 @@ def check_format(self, format): "specify the Matomo site ID with the --idsite argument" ) + +class CustomDimensions: + """ + Utility to manage custom dimensions. 
+ """ + dimensions = {} + + def __init__(self): + self.lock = threading.RLock() + + def pull_dimensions(self, site_id): + self.lock.acquire() + try: + dimensions = matomo.call_api('CustomDimensions.getConfiguredCustomDimensions', idSite=site_id) + for dimension in dimensions: + if dimension['active']: + self.dimensions.setdefault(int(site_id), {})[(dimension['scope'], dimension['name'])] = int(dimension['idcustomdimension']) + finally: + self.lock.release() + + def create_new_dimension(self, site_id, scope, name): + self.lock.acquire() + try: + return matomo.call_api('CustomDimensions.configureNewCustomDimension', idSite=site_id, scope=scope, name=name, active=1) + finally: + self.lock.release() + + def get_custom_dimension_id(self, site_id, scope, name): + if self.dimensions.get(int(site_id)) is None: + self.pull_dimensions(site_id) + dimension_id = self.dimensions.get(int(site_id), {}).get((scope, name)) + + if dimension_id: + return dimension_id + self.lock.acquire() + try: + dimension_id = self.create_new_dimension(site_id, scope, name)['value'] + self.pull_dimensions(site_id) + return dimension_id + finally: + self.lock.release() + + class Recorder: """ A Recorder fetches hits from the Queue and inserts them into Matomo using @@ -1910,11 +1961,11 @@ def _get_hit_args(self, hit): # handle custom variables before generating args dict if config.options.enable_bots: if hit.is_robot: - hit.add_visit_custom_var("Bot", hit.user_agent) + hit.add_visit_custom_dimension(site_id, "Bot", hit.user_agent) else: - hit.add_visit_custom_var("Not-Bot", hit.user_agent) + hit.add_visit_custom_dimension(site_id, "Not-Bot", hit.user_agent) - hit.add_page_custom_var("HTTP-code", hit.status) + hit.add_action_custom_dimension(site_id, "HTTP-code", hit.status) args = { 'rec': '1', @@ -2092,6 +2143,22 @@ def get_visitor_id_hash(self): return abs(hash(visitor_id)) + def add_action_custom_dimension(self, site_id, key, value): + """ + Adds a page custom dimension to this Hit. 
+ """ + self._add_custom_dimension(site_id, key, value, 'action') + + def add_visit_custom_dimension(self, site_id, key, value): + """ + Adds a visit custom dimension to this Hit. + """ + self._add_custom_dimension(site_id, key, value, 'visit') + + def _add_custom_dimension(self, site_id, key, value, scope): + dimension_id = custom_dimensions.get_custom_dimension_id(site_id, scope, key) + self.args['dimension%s' % dimension_id] = value + def add_page_custom_var(self, key, value): """ Adds a page custom variable to this Hit. @@ -2434,23 +2501,16 @@ def filtered_line(line, reason): args={}, ) + if config.options.regex_groups_to_ignore: + format.remove_ignored_groups(config.options.regex_groups_to_ignore) + + # FIXME: custom variables are deprecated... if config.options.regex_group_to_page_cvars_map: self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_page_cvars_map, True) if config.options.regex_group_to_visit_cvars_map: self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_visit_cvars_map, False) - if config.options.regex_groups_to_ignore: - format.remove_ignored_groups(config.options.regex_groups_to_ignore) - - # Add http method page cvar - try: - httpmethod = format.get('method') - if config.options.track_http_method and httpmethod != '-': - hit.add_page_custom_var('HTTP-method', httpmethod) - except: - pass - try: hit.query_string = format.get('query_string') hit.path = hit.full_path @@ -2562,6 +2622,22 @@ def filtered_line(line, reason): invalid_line(line, 'invalid timezone') continue + site_id, main_url = resolver.resolve(hit) + + if config.options.regex_group_to_action_cdims_map: + self._add_custom_dimension_from_regex_groups(site_id, hit, format, config.options.regex_group_to_action_cdims_map, 'action') + + if config.options.regex_group_to_visit_cdims_map: + self._add_custom_dimension_from_regex_groups(site_id, hit, format, config.options.regex_group_to_visit_cdims_map, 'visit') + + # Add http method 
page custom dimension + try: + httpmethod = format.get('method') + if config.options.track_http_method and httpmethod != '-': + hit.add_action_custom_dimension(site_id, 'HTTP-method', httpmethod) + except: + pass + if config.options.replay_tracking: # we need a query string and we only consider requests with piwik.php if not hit.query_string or not self.is_hit_for_tracker(hit): @@ -2620,6 +2696,21 @@ def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var): else: hit.add_visit_custom_var(custom_var_name, value) + def _add_custom_dimension_from_regex_groups(self, site_id, hit, format, groups, scope): + for group_name, custom_dim_name in groups.items(): + if group_name in format.get_all(): + value = format.get(group_name) + + # don't track the '-' empty placeholder value + if value == '-': + continue + + if scope == 'action': + hit.add_action_custom_dimension(site_id, custom_dim_name, value) + else: + hit.add_visit_custom_dimension(site_id, custom_dim_name, value) + + def main(): """ Start the importing process. 
@@ -2667,6 +2758,7 @@ def fatal_error(error, filename=None, lineno=None): stats = Statistics() resolver = config.get_resolver() parser = Parser() + custom_dimensions = CustomDimensions() main() sys.exit(0) except KeyboardInterrupt: diff --git a/tests/test_main.py b/tests/test_main.py index 69758c5..5b97c2d 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -177,6 +177,8 @@ def __init__(self): self.w3c_field_regexes = {} self.regex_group_to_visit_cvars_map = {} self.regex_group_to_page_cvars_map = {} + self.regex_group_to_visit_cdims_map = {} + self.regex_group_to_action_cdims_map = {} self.regex_groups_to_ignore = None self.replay_tracking_expected_tracker_file = 'piwik.php' self.debug_request_limit = None @@ -200,6 +202,9 @@ class Resolver(object): def check_format(self, format_): pass + def resolve(self, hit): + return 1, "https://example.org/" + class Recorder(object): """Mock recorder which collects hits but doesn't put their in database.""" recorders = [] @@ -208,6 +213,16 @@ class Recorder(object): def add_hits(cls, hits): cls.recorders.extend(hits) +import_logs.custom_dimensions = import_logs.CustomDimensions() +import_logs.custom_dimensions.dimensions[1] = { + ('visit', 'User Name'): 1, + ('visit', 'The Date'): 2, + ('action', 'Generation Time'): 3, + ('action', 'The Referrer'): 4, + ('action', 'HTTP-method'): 5 +} + + def test_replay_tracking_seconds_to_add_to_date(): """Test data parsing from sample log file.""" file_ = 'logs/logs_to_tests.log' @@ -478,7 +493,7 @@ def test_iis_custom_format(): assert hits[0]['extension'] == '/products/theproduct' assert hits[0]['is_download'] == False assert hits[0]['referrer'] == 'http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en' - assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[0]['args'] == {'dimension5': 'GET'} assert hits[0]['generation_time_milli'] == 109 assert hits[0]['host'] == 'foo' assert hits[0]['filename'] == 'logs/iis_custom.log' @@ 
-497,7 +512,7 @@ def test_iis_custom_format(): assert hits[1]['extension'] == '/topic/hw43061' assert hits[1]['is_download'] == False assert hits[1]['referrer'] == '' - assert hits[1]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[1]['args'] == {'dimension5': 'GET'} assert hits[1]['generation_time_milli'] == 0 assert hits[1]['host'] == 'foo' assert hits[1]['filename'] == 'logs/iis_custom.log' @@ -516,7 +531,7 @@ def test_iis_custom_format(): assert hits[2]['extension'] == '/hello/world/6,681965' assert hits[2]['is_download'] == False assert hits[2]['referrer'] == '' - assert hits[2]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[2]['args'] == {'dimension5': 'GET'} assert hits[2]['generation_time_milli'] == 359 assert hits[2]['host'] == 'foo' assert hits[2]['filename'] == 'logs/iis_custom.log' @@ -554,7 +569,7 @@ def test_netscaler_parsing(): assert hits[0]['extension'] == 'jsp' assert hits[0]['is_download'] == False assert hits[0]['referrer'] == '' - assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[0]['args'] == {'dimension5': 'GET'} assert hits[0]['generation_time_milli'] == 1000 assert hits[0]['host'] == 'foo' assert hits[0]['filename'] == 'logs/netscaler.log' @@ -752,7 +767,7 @@ def test_amazon_cloudfront_web_parsing(): assert hits[0]['extension'] == 'html' assert hits[0]['is_download'] == False assert hits[0]['referrer'] == 'https://example.com/' - assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[0]['args'] == {'dimension5': 'GET'} assert hits[0]['generation_time_milli'] == 1.0 assert hits[0]['host'] == 'foo' assert hits[0]['filename'] == 'logs/amazon_cloudfront_web.log' @@ -836,7 +851,7 @@ def test_incapsulaw3c_parsing(): assert hits[0]['extension'] == 'php' assert hits[0]['is_download'] == False assert hits[0]['referrer'] == u'' - assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', u'"GET"']}} + assert hits[0]['args'] == {'dimension5': '"GET"'} assert 
hits[0]['length'] == 10117 assert hits[0]['generation_time_milli'] == 0 assert hits[0]['host'] == 'www.example.com' @@ -857,7 +872,7 @@ def test_incapsulaw3c_parsing(): assert hits[1]['extension'] == '/rss/news' assert hits[1]['is_download'] == False assert hits[1]['referrer'] == u'' - assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', u'"GET"']}} + assert hits[0]['args'] == {'dimension5': '"GET"'} assert hits[1]['length'] == 0 assert hits[1]['generation_time_milli'] == 0 assert hits[1]['host'] == 'www.example.com' @@ -982,8 +997,8 @@ def test_ignore_groups_option_removes_groups(): assert hits[0]['userid'] == None assert hits[0]['generation_time_milli'] == 0 -def test_regex_group_to_custom_var_options(): - """Test that the --regex-group-to-visit-cvar and --regex-group-to-page-cvar track regex groups to custom vars.""" +def test_regex_group_to_custom_dimensions_options(): + """Test that the --regex-group-to-visit-cdim and --regex-group-to-action-cdim track regex groups to custom dimensions.""" file_ = 'logs/iis.log' @@ -997,11 +1012,11 @@ def test_regex_group_to_custom_var_options(): import_logs.config.options.replay_tracking = False import_logs.config.options.w3c_time_taken_in_millisecs = True import_logs.config.options.regex_groups_to_ignore = set() - import_logs.config.options.regex_group_to_visit_cvars_map = { + import_logs.config.options.regex_group_to_visit_cdims_map = { 'userid': "User Name", 'date': "The Date" } - import_logs.config.options.regex_group_to_page_cvars_map = { + import_logs.config.options.regex_group_to_action_cdims_map = { 'generation_time_milli': 'Generation Time', 'referrer': 'The Referrer' } @@ -1009,10 +1024,10 @@ def test_regex_group_to_custom_var_options(): hits = [hit.__dict__ for hit in Recorder.recorders] - assert ['The Date', '2012-04-01 00:00:13'] in hits[0]['args']['_cvar'].values() - assert ['User Name', 'theuser'] in hits[0]['args']['_cvar'].values() - assert ['Generation Time', '1687'] in hits[0]['args']['cvar'].values() - assert 
['HTTP-method', 'GET'] in hits[0]['args']['cvar'].values() + assert hits[0]['args']['dimension1'] == 'theuser' + assert hits[0]['args']['dimension2'] == '2012-04-01 00:00:13' + assert hits[0]['args']['dimension3'] == '1687' + assert hits[0]['args']['dimension5'] == 'GET' assert hits[0]['userid'] == 'theuser' assert hits[0]['date'] == datetime.datetime(2012, 4, 1, 0, 0, 13) @@ -1063,8 +1078,8 @@ def test_custom_log_date_format_option(): Recorder.recorders = [] import_logs.parser = import_logs.Parser() import_logs.config.options.w3c_field_regexes = None - import_logs.config.options.regex_group_to_visit_cvars_map = None - import_logs.config.options.regex_group_to_page_cvars_map = None + import_logs.config.options.regex_group_to_visit_cdims_map = None + import_logs.config.options.regex_group_to_action_cdims_map = None import_logs.config.options.log_format_regex = ( r'(?P\S+)\s+\S+\s+\S+\s+\[(?P.*?)\]\s+' r'"\S+\s+(?P.*?)\s+\S+"\s+(?P\S+)\s+(?P\S+)'