Skip to content

Commit

Permalink
Custom dimensions
Browse files Browse the repository at this point in the history
* Integrate Custom dimensions, as a replacement for Custom
  Variables. (Closes #188)

Signed-off-by: Samuele Kaplun <[email protected]>
  • Loading branch information
kaplun authored and sgiehl committed Nov 17, 2020
1 parent 2299a42 commit 1dd3bd0
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 46 deletions.
150 changes: 121 additions & 29 deletions import_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import codecs
import datetime
import fnmatch
import functools
import gzip
import hashlib
import http.client
Expand Down Expand Up @@ -816,10 +817,10 @@ def _create_parser(self):
parser.add_argument(
'--w3c-field-regex', action=StoreDictKeyPair, metavar='KEY=VAL', default={}, dest="w3c_field_regexes", type=str,
help="Specify a regex for a field in your W3C extended log file. You can use this option to parse fields the "
"importer does not natively recognize and then use one of the --regex-group-to-XXX-cvar options to track "
"the field in a custom variable. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) "
"--regex-group-to-page-cvar=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field "
"in the 'Windows Status Code' custom variable. Regexes must contain a named group."
"importer does not natively recognize and then use one of the --regex-group-to-XXX-cdim options to track "
"the field in a custom dimension. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) "
"--regex-group-to-page-cdim=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field "
"in the 'Windows Status Code' custom dimension. Regexes must contain a named group."
)
parser.add_argument(
'--title-category-delimiter', dest='title_category_delimiter', default='/',
Expand All @@ -840,22 +841,29 @@ def _create_parser(self):
"disable normal user id tracking. See documentation for --log-format-regex for list of available "
"regex groups."
)

parser.add_argument(
'--regex-group-to-visit-cvar', action=StoreDictKeyPair, metavar='KEY=VAL',dest='regex_group_to_visit_cvars_map', default={},
help="Track an attribute through a custom variable with visit scope instead of through Matomo's normal "
"approach. For example, to track usernames as a custom variable instead of through the uid tracking "
"parameter, supply --regex-group-to-visit-cvar=\"userid=User Name\". This will track usernames in a "
"custom variable named 'User Name'. The list of available regex groups can be found in the documentation "
'--regex-group-to-visit-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_visit_cvars_map', default={},
help="DEPRECATED"
)
parser.add_argument(
'--regex-group-to-page-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_page_cvars_map', default={},
help="DEPRECATED"
)
parser.add_argument(
'--regex-group-to-visit-cdim', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_visit_cdims_map', default={},
help="Track an attribute through a custom dimension with visit scope instead of through Matomo's normal "
"approach. For example, to track usernames as a custom dimension instead of through the uid tracking "
"parameter, supply --regex-group-to-visit-cdim=\"userid=User Name\". This will track usernames in a "
"custom dimension named 'User Name'. The list of available regex groups can be found in the documentation "
"for --log-format-regex (additional regex groups you may have defined "
"in --log-format-regex can also be used)."
)
parser.add_argument(
'--regex-group-to-page-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_page_cvars_map', default={},
help="Track an attribute through a custom variable with page scope instead of through Matomo's normal "
"approach. For example, to track usernames as a custom variable instead of through the uid tracking "
"parameter, supply --regex-group-to-page-cvar=\"userid=User Name\". This will track usernames in a "
"custom variable named 'User Name'. The list of available regex groups can be found in the documentation "
'--regex-group-to-action-cdim', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_action_cdims_map', default={},
help="Track an attribute through a custom dimension with action scope instead of through Matomo's normal "
"approach. For example, to track usernames as a custom dimension instead of through the uid tracking "
"parameter, supply --regex-group-to-action-cdim=\"userid=User Name\". This will track usernames in a "
"custom dimension named 'User Name'. The list of available regex groups can be found in the documentation "
"for --log-format-regex (additional regex groups you may have defined "
"in --log-format-regex can also be used)."
)
Expand Down Expand Up @@ -1782,6 +1790,49 @@ def check_format(self, format):
"specify the Matomo site ID with the --idsite argument"
)


class CustomDimensions:
    """
    Utility to manage custom dimensions.

    Keeps a per-site cache that maps ``(scope, name)`` to the numeric ID of
    the custom dimension, filled from the Matomo API, and creates the
    dimensions that are requested but not configured yet.
    """

    def __init__(self):
        # Re-entrant, so get_custom_dimension_id() can keep holding the lock
        # while it calls pull_dimensions() / create_new_dimension().
        self.lock = threading.RLock()
        # {site_id (int): {(scope, name): dimension_id (int)}}
        # Kept per instance: a class-level dict would be shared (and leak
        # state) between every CustomDimensions object.
        self.dimensions = {}

    def pull_dimensions(self, site_id):
        """
        Refresh the cache of active custom dimensions of a site.

        Calls CustomDimensions.getConfiguredCustomDimensions and stores the
        ID of every active dimension under its (scope, name) pair.
        """
        with self.lock:
            dimensions = matomo.call_api(
                'CustomDimensions.getConfiguredCustomDimensions', idSite=site_id
            )
            # Register the site even when it has no active dimension, so
            # that it is not pulled again on every single lookup.
            site_dimensions = self.dimensions.setdefault(int(site_id), {})
            for dimension in dimensions:
                if dimension['active']:
                    cache_key = (dimension['scope'], dimension['name'])
                    site_dimensions[cache_key] = int(dimension['idcustomdimension'])

    def create_new_dimension(self, site_id, scope, name):
        """
        Create an active custom dimension and return the raw API response.
        """
        with self.lock:
            return matomo.call_api(
                'CustomDimensions.configureNewCustomDimension',
                idSite=site_id, scope=scope, name=name, active=1
            )

    def get_custom_dimension_id(self, site_id, scope, name):
        """
        Return the ID of the custom dimension `name` with the given scope.

        The ID is served from the cache when possible. Otherwise the
        dimensions of the site are pulled and, when the dimension is still
        unknown, it is created through the API.
        """
        site_key = int(site_id)
        cache_key = (scope, name)

        # Fast path: no locking needed when the dimension is already cached.
        dimension_id = self.dimensions.get(site_key, {}).get(cache_key)
        if dimension_id is not None:
            return dimension_id

        with self.lock:
            if site_key not in self.dimensions:
                self.pull_dimensions(site_id)

            # Look again while holding the lock: another thread may have
            # pulled or created the dimension in the meantime, and creating
            # it a second time must be avoided.
            dimension_id = self.dimensions.get(site_key, {}).get(cache_key)
            if dimension_id is not None:
                return dimension_id

            # NOTE(review): the created ID is read from the 'value' key of the
            # API response and returned as is -- confirm its type if callers
            # ever need more than string formatting.
            dimension_id = self.create_new_dimension(site_id, scope, name)['value']
            self.pull_dimensions(site_id)
            return dimension_id


class Recorder:
"""
A Recorder fetches hits from the Queue and inserts them into Matomo using
Expand Down Expand Up @@ -1910,11 +1961,11 @@ def _get_hit_args(self, hit):
# handle custom variables before generating args dict
if config.options.enable_bots:
if hit.is_robot:
hit.add_visit_custom_var("Bot", hit.user_agent)
hit.add_visit_custom_dimension(site_id, "Bot", hit.user_agent)
else:
hit.add_visit_custom_var("Not-Bot", hit.user_agent)
hit.add_visit_custom_dimension(site_id, "Not-Bot", hit.user_agent)

hit.add_page_custom_var("HTTP-code", hit.status)
hit.add_action_custom_dimension(site_id, "HTTP-code", hit.status)

args = {
'rec': '1',
Expand Down Expand Up @@ -2092,6 +2143,22 @@ def get_visitor_id_hash(self):

return abs(hash(visitor_id))

def add_action_custom_dimension(self, site_id, key, value):
    """
    Adds an action custom dimension to this Hit.
    """
    scope = 'action'
    self._add_custom_dimension(site_id, key, value, scope)

def add_visit_custom_dimension(self, site_id, key, value):
    """
    Adds a visit custom dimension to this Hit.
    """
    scope = 'visit'
    self._add_custom_dimension(site_id, key, value, scope)

def _add_custom_dimension(self, site_id, key, value, scope):
    """
    Stores `value` in this Hit's args under 'dimension<ID>', where the ID is
    looked up from the site, the scope and the dimension name (`key`).
    """
    resolved_id = custom_dimensions.get_custom_dimension_id(site_id, scope, key)
    arg_name = 'dimension%s' % resolved_id
    self.args[arg_name] = value

def add_page_custom_var(self, key, value):
"""
Adds a page custom variable to this Hit.
Expand Down Expand Up @@ -2434,23 +2501,16 @@ def filtered_line(line, reason):
args={},
)

if config.options.regex_groups_to_ignore:
format.remove_ignored_groups(config.options.regex_groups_to_ignore)

# FIXME: custom variables are deprecated...
if config.options.regex_group_to_page_cvars_map:
self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_page_cvars_map, True)

if config.options.regex_group_to_visit_cvars_map:
self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_visit_cvars_map, False)

if config.options.regex_groups_to_ignore:
format.remove_ignored_groups(config.options.regex_groups_to_ignore)

# Add http method page cvar
try:
httpmethod = format.get('method')
if config.options.track_http_method and httpmethod != '-':
hit.add_page_custom_var('HTTP-method', httpmethod)
except:
pass

try:
hit.query_string = format.get('query_string')
hit.path = hit.full_path
Expand Down Expand Up @@ -2562,6 +2622,22 @@ def filtered_line(line, reason):
invalid_line(line, 'invalid timezone')
continue

site_id, main_url = resolver.resolve(hit)

if config.options.regex_group_to_action_cdims_map:
self._add_custom_dimension_from_regex_groups(site_id, hit, format, config.options.regex_group_to_action_cdims_map, 'action')

if config.options.regex_group_to_visit_cdims_map:
self._add_custom_dimension_from_regex_groups(site_id, hit, format, config.options.regex_group_to_visit_cdims_map, 'visit')

# Add http method action custom dimension
try:
httpmethod = format.get('method')
if config.options.track_http_method and httpmethod != '-':
hit.add_action_custom_dimension(site_id, 'HTTP-method', httpmethod)
except:
pass

if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php
if not hit.query_string or not self.is_hit_for_tracker(hit):
Expand Down Expand Up @@ -2620,6 +2696,21 @@ def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var):
else:
hit.add_visit_custom_var(custom_var_name, value)

def _add_custom_dimension_from_regex_groups(self, site_id, hit, format, groups, scope):
for group_name, custom_dim_name in groups.items():
if group_name in format.get_all():
value = format.get(group_name)

# don't track the '-' empty placeholder value
if value == '-':
continue

if scope == 'action':
hit.add_action_custom_dimension(site_id, custom_dim_name, value)
else:
hit.add_visit_custom_dimension(site_id, custom_dim_name, value)


def main():
"""
Start the importing process.
Expand Down Expand Up @@ -2667,6 +2758,7 @@ def fatal_error(error, filename=None, lineno=None):
stats = Statistics()
resolver = config.get_resolver()
parser = Parser()
custom_dimensions = CustomDimensions()
main()
sys.exit(0)
except KeyboardInterrupt:
Expand Down
49 changes: 32 additions & 17 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ def __init__(self):
self.w3c_field_regexes = {}
self.regex_group_to_visit_cvars_map = {}
self.regex_group_to_page_cvars_map = {}
self.regex_group_to_visit_cdims_map = {}
self.regex_group_to_action_cdims_map = {}
self.regex_groups_to_ignore = None
self.replay_tracking_expected_tracker_file = 'piwik.php'
self.debug_request_limit = None
Expand All @@ -200,6 +202,9 @@ class Resolver(object):
def check_format(self, format_):
    """Accept any format; this mock performs no check."""
    return None

def resolve(self, hit):
    """Return the same (site ID, main URL) pair for every hit."""
    site_id = 1
    main_url = "https://example.org/"
    return site_id, main_url

class Recorder(object):
"""Mock recorder which collects hits but doesn't put them in the database."""
recorders = []
Expand All @@ -208,6 +213,16 @@ class Recorder(object):
def add_hits(cls, hits):
cls.recorders.extend(hits)

# Install a CustomDimensions instance whose cache for site 1 (the site ID
# returned by the mock Resolver.resolve) is already filled, so lookups of
# these (scope, name) pairs are answered from the cache.
import_logs.custom_dimensions = import_logs.CustomDimensions()
import_logs.custom_dimensions.dimensions[1] = {
('visit', 'User Name'): 1,
('visit', 'The Date'): 2,
('action', 'Generation Time'): 3,
('action', 'The Referrer'): 4,
('action', 'HTTP-method'): 5
}


def test_replay_tracking_seconds_to_add_to_date():
"""Test data parsing from sample log file."""
file_ = 'logs/logs_to_tests.log'
Expand Down Expand Up @@ -478,7 +493,7 @@ def test_iis_custom_format():
assert hits[0]['extension'] == '/products/theproduct'
assert hits[0]['is_download'] == False
assert hits[0]['referrer'] == 'http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en'
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[0]['args'] == {'dimension5': 'GET'}
assert hits[0]['generation_time_milli'] == 109
assert hits[0]['host'] == 'foo'
assert hits[0]['filename'] == 'logs/iis_custom.log'
Expand All @@ -497,7 +512,7 @@ def test_iis_custom_format():
assert hits[1]['extension'] == '/topic/hw43061'
assert hits[1]['is_download'] == False
assert hits[1]['referrer'] == ''
assert hits[1]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[1]['args'] == {'dimension5': 'GET'}
assert hits[1]['generation_time_milli'] == 0
assert hits[1]['host'] == 'foo'
assert hits[1]['filename'] == 'logs/iis_custom.log'
Expand All @@ -516,7 +531,7 @@ def test_iis_custom_format():
assert hits[2]['extension'] == '/hello/world/6,681965'
assert hits[2]['is_download'] == False
assert hits[2]['referrer'] == ''
assert hits[2]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[2]['args'] == {'dimension5': 'GET'}
assert hits[2]['generation_time_milli'] == 359
assert hits[2]['host'] == 'foo'
assert hits[2]['filename'] == 'logs/iis_custom.log'
Expand Down Expand Up @@ -554,7 +569,7 @@ def test_netscaler_parsing():
assert hits[0]['extension'] == 'jsp'
assert hits[0]['is_download'] == False
assert hits[0]['referrer'] == ''
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[0]['args'] == {'dimension5': 'GET'}
assert hits[0]['generation_time_milli'] == 1000
assert hits[0]['host'] == 'foo'
assert hits[0]['filename'] == 'logs/netscaler.log'
Expand Down Expand Up @@ -752,7 +767,7 @@ def test_amazon_cloudfront_web_parsing():
assert hits[0]['extension'] == 'html'
assert hits[0]['is_download'] == False
assert hits[0]['referrer'] == 'https://example.com/'
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[0]['args'] == {'dimension5': 'GET'}
assert hits[0]['generation_time_milli'] == 1.0
assert hits[0]['host'] == 'foo'
assert hits[0]['filename'] == 'logs/amazon_cloudfront_web.log'
Expand Down Expand Up @@ -836,7 +851,7 @@ def test_incapsulaw3c_parsing():
assert hits[0]['extension'] == 'php'
assert hits[0]['is_download'] == False
assert hits[0]['referrer'] == u''
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', u'"GET"']}}
assert hits[0]['args'] == {'dimension5': '"GET"'}
assert hits[0]['length'] == 10117
assert hits[0]['generation_time_milli'] == 0
assert hits[0]['host'] == 'www.example.com'
Expand All @@ -857,7 +872,7 @@ def test_incapsulaw3c_parsing():
assert hits[1]['extension'] == '/rss/news'
assert hits[1]['is_download'] == False
assert hits[1]['referrer'] == u''
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', u'"GET"']}}
assert hits[0]['args'] == {'dimension5': '"GET"'}
assert hits[1]['length'] == 0
assert hits[1]['generation_time_milli'] == 0
assert hits[1]['host'] == 'www.example.com'
Expand Down Expand Up @@ -982,8 +997,8 @@ def test_ignore_groups_option_removes_groups():
assert hits[0]['userid'] == None
assert hits[0]['generation_time_milli'] == 0

def test_regex_group_to_custom_var_options():
"""Test that the --regex-group-to-visit-cvar and --regex-group-to-page-cvar track regex groups to custom vars."""
def test_regex_group_to_custom_dimensions_options():
"""Test that the --regex-group-to-visit-cdim and --regex-group-to-action-cdim options track regex groups as custom dimensions."""

file_ = 'logs/iis.log'

Expand All @@ -997,22 +1012,22 @@ def test_regex_group_to_custom_var_options():
import_logs.config.options.replay_tracking = False
import_logs.config.options.w3c_time_taken_in_millisecs = True
import_logs.config.options.regex_groups_to_ignore = set()
import_logs.config.options.regex_group_to_visit_cvars_map = {
import_logs.config.options.regex_group_to_visit_cdims_map = {
'userid': "User Name",
'date': "The Date"
}
import_logs.config.options.regex_group_to_page_cvars_map = {
import_logs.config.options.regex_group_to_action_cdims_map = {
'generation_time_milli': 'Generation Time',
'referrer': 'The Referrer'
}
import_logs.parser.parse(file_)

hits = [hit.__dict__ for hit in Recorder.recorders]

assert ['The Date', '2012-04-01 00:00:13'] in hits[0]['args']['_cvar'].values()
assert ['User Name', 'theuser'] in hits[0]['args']['_cvar'].values()
assert ['Generation Time', '1687'] in hits[0]['args']['cvar'].values()
assert ['HTTP-method', 'GET'] in hits[0]['args']['cvar'].values()
assert hits[0]['args']['dimension1'] == 'theuser'
assert hits[0]['args']['dimension2'] == '2012-04-01 00:00:13'
assert hits[0]['args']['dimension3'] == '1687'
assert hits[0]['args']['dimension5'] == 'GET'

assert hits[0]['userid'] == 'theuser'
assert hits[0]['date'] == datetime.datetime(2012, 4, 1, 0, 0, 13)
Expand Down Expand Up @@ -1063,8 +1078,8 @@ def test_custom_log_date_format_option():
Recorder.recorders = []
import_logs.parser = import_logs.Parser()
import_logs.config.options.w3c_field_regexes = None
import_logs.config.options.regex_group_to_visit_cvars_map = None
import_logs.config.options.regex_group_to_page_cvars_map = None
import_logs.config.options.regex_group_to_visit_cdims_map = None
import_logs.config.options.regex_group_to_action_cdims_map = None
import_logs.config.options.log_format_regex = (
r'(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<date>.*?)\]\s+'
r'"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)'
Expand Down

0 comments on commit 1dd3bd0

Please sign in to comment.