Skip to content

Commit

Permalink
Merge pull request #190 from SebastianMC/178-week-numbers-date-regex-patterns
Browse files Browse the repository at this point in the history

#178 - week-number based date extraction patterns for titles, incl. Www, Www- and Www+ specs
#191 - Explicit support for the common date formats of `yyyy-mm-dd` and `yyyy-dd-mm`
  • Loading branch information
SebastianMC authored Jan 14, 2025
2 parents f9c9c0b + 6e7b2e1 commit c200c2e
Show file tree
Hide file tree
Showing 8 changed files with 621 additions and 33 deletions.
96 changes: 89 additions & 7 deletions src/custom-sort/matchers.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import {
getDateForWeekOfYear
} from "../utils/week-of-year";

// Regex source strings (not RegExp objects) for Roman numbers.
// Each tolerates optional leading spaces and captures the number itself in group 1.
export const RomanNumberRegexStr: string = ' *([MDCLXVI]+)'; // Roman number
export const CompoundRomanNumberDotRegexStr: string = ' *([MDCLXVI]+(?:\\.[MDCLXVI]+)*)';// Compound Roman number with dot as separator
export const CompoundRomanNumberDashRegexStr: string = ' *([MDCLXVI]+(?:-[MDCLXVI]+)*)'; // Compound Roman number with dash as separator
Expand All @@ -6,15 +10,26 @@ export const NumberRegexStr: string = ' *(\\d+)'; // Plain number
// Regex source strings for compound decimal numbers (e.g. 1.2.3 or 1-2-3).
// Optional leading spaces are skipped; the whole compound number is captured in group 1.
export const CompoundNumberDotRegexStr: string = ' *(\\d+(?:\\.\\d+)*)'; // Compound number with dot as separator
export const CompoundNumberDashRegexStr: string = ' *(\\d+(?:-\\d+)*)'; // Compound number with dash as separator

// Regex source strings for dates embedded in titles. Optional leading spaces are skipped
// and the date text is captured in group 1.
// NOTE(review): `[0-3]*` (and `[0-5]*` for weeks) is unbounded, so more than one leading digit
// is accepted by these patterns - confirm whether `?` was intended.

// Date like 2020-01-31. The yyyy-dd-mm variant has the same textual shape, hence the shared pattern;
// which position is the day and which the month is decided by the normalizer paired with it.
export const Date_yyyy_mm_dd_RegexStr: string = ' *(\\d{4}-[0-3]*[0-9]-[0-3]*[0-9])'
export const Date_yyyy_dd_mm_RegexStr: string = Date_yyyy_mm_dd_RegexStr

export const Date_dd_Mmm_yyyy_RegexStr: string = ' *([0-3]*[0-9]-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\\d{4})'; // Date like 01-Jan-2020
export const Date_Mmm_dd_yyyy_RegexStr: string = ' *((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-[0-3]*[0-9]-\\d{4})'; // Date like Jan-01-2020

export const DOT_SEPARATOR = '.'
// Week-number based date with an explicit month and day in parentheses, like 2025-W03 (01-13)
export const Date_yyyy_Www_mm_dd_RegexStr: string = ' *(\\d{4}-W[0-5]*[0-9] \\([0-3]*[0-9]-[0-3]*[0-9]\\))'
// Week-number based date like 2025-W03, optionally followed by a single '-' or '+'
export const Date_yyyy_WwwISO_RegexStr: string = ' *(\\d{4}-W[0-5]*[0-9][-+]?)'
// Same textual shape as the ISO variant; the two differ only in the normalizer paired with them
export const Date_yyyy_Www_RegexStr: string = Date_yyyy_WwwISO_RegexStr

// Separator characters. The ASCII codes matter because normalized values are compared as text.
export const DOT_SEPARATOR = '.' // ASCII 46
export const DASH_SEPARATOR = '-'

const SLASH_SEPARATOR = '/' // ASCII 47
const SLASH_SEPARATOR = '/' // ASCII 47, right before ASCII 48 = '0'
const GT_SEPARATOR = '>' // ASCII 62, alphabetical sorting in Collator puts it after /
const PIPE_SEPARATOR = '|' // ASCII 124

// Used by the week-based date normalizer in place of the first '/' of the normalized suffix
// when the title carries a '-' or '+' order modifier, so that otherwise equal dates are
// ordered relative to the unmodified one.
const EARLIER_THAN_SLASH_SEPARATOR = DOT_SEPARATOR
const LATER_THAN_SLASH_SEPARATOR = GT_SEPARATOR

export const DEFAULT_NORMALIZATION_PLACES = 8; // Fixed width of a normalized number (with leading zeros)

// Property escapes:
Expand Down Expand Up @@ -51,9 +66,9 @@ export function getNormalizedNumber(s: string = '', separator?: string, places?:
// guarantees correct order (/ = ASCII 47, | = ASCII 124)
if (separator) {
const components: Array<string> = s.split(separator).filter(s => s)
return `${components.map((c) => prependWithZeros(c, places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}//`
return `${components.map((c) => prependWithZeros(c, places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
} else {
return `${prependWithZeros(s, places ?? DEFAULT_NORMALIZATION_PLACES)}//`
return `${prependWithZeros(s, places ?? DEFAULT_NORMALIZATION_PLACES)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
}
}

Expand Down Expand Up @@ -97,9 +112,9 @@ export function getNormalizedRomanNumber(s: string, separator?: string, places?:
// guarantees correct order (/ = ASCII 47, | = ASCII 124)
if (separator) {
const components: Array<string> = s.split(separator).filter(s => s)
return `${components.map((c) => prependWithZeros(romanToIntStr(c), places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}//`
return `${components.map((c) => prependWithZeros(romanToIntStr(c), places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
} else {
return `${prependWithZeros(romanToIntStr(s), places ?? DEFAULT_NORMALIZATION_PLACES)}//`
return `${prependWithZeros(romanToIntStr(s), places ?? DEFAULT_NORMALIZATION_PLACES)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
}
}

Expand All @@ -117,9 +132,76 @@ export function getNormalizedDate_NormalizerFn_for(separator: string, dayIdx: nu
const monthValue = months ? `${1 + MONTHS.indexOf(components[monthIdx])}` : components[monthIdx]
const month = prependWithZeros(monthValue, MONTH_POSITIONS)
const year = prependWithZeros(components[yearIdx], YEAR_POSITIONS)
return `${year}-${month}-${day}//`
return `${year}-${month}-${day}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
}
}

// Ready-made date normalizers, one per supported textual date format.
// NOTE(review): the argument order appears to be (separator, dayIdx, monthIdx, yearIdx, months?),
// where each index is the position of that component after splitting on the separator - confirm
// against the full signature of getNormalizedDate_NormalizerFn_for.
// The MONTHS argument is passed for the formats that spell the month as a name (Jan, Feb, ...).
export const getNormalizedDate_yyyy_mm_dd_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 2, 1, 0)
export const getNormalizedDate_yyyy_dd_mm_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 1, 2, 0)
export const getNormalizedDate_dd_Mmm_yyyy_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 0, 1, 2, MONTHS)
export const getNormalizedDate_Mmm_dd_yyyy_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 1, 0, 2, MONTHS)

// Optional one-character suffix of a `yyyy-Www` token which shifts its relative order
const DateExtractor_orderModifier_earlier_than = '-'
const DateExtractor_orderModifier_later_than = '+'

// Extractors which split an already matched week-based date into its components.
// Month and day accept one or two digits: the matching pattern Date_yyyy_Www_mm_dd_RegexStr
// lets a one-digit month or day through (e.g. `2025-W1 (1-5)`), so demanding exactly two digits
// here would leave such titles matched but not extractable.
const DateExtractor_yyyy_Www_mm_dd_Regex = /(\d{4})-W(\d{1,2}) \((\d{1,2})-(\d{1,2})\)/
const DateExtractor_yyyy_Www_Regex = /(\d{4})-W(\d{1,2})([-+]?)/

// Matching groups
const YEAR_IDX = 1
const WEEK_IDX = 2
const MONTH_IDX = 3
const DAY_IDX = 4
const RELATIVE_ORDER_IDX = 3 // For the yyyy-Www only: yyyy-Www- or yyyy-Www+

// Month numbers (1-based) used when a week spills over a year boundary
const DECEMBER = 12
const JANUARY = 1

/**
 * Creates a normalizer for week-number based dates found in titles.
 *
 * The produced text has the shape `<year>-<month>-<day><separator>/`, with each numeric part
 * left-padded via prependWithZeros(). `<separator>` is `/` by default; for `yyyy-Www-` it is
 * EARLIER_THAN_SLASH_SEPARATOR and for `yyyy-Www+` it is LATER_THAN_SLASH_SEPARATOR, which
 * enforces the relative order of otherwise identical dates.
 *
 * @param consumeWeek - when true, the input is parsed as `yyyy-Www` (optionally followed by `-` or `+`)
 *   and month/day are taken from the date returned by getDateForWeekOfYear(). When false, the input is
 *   parsed as `yyyy-Www (mm-dd)` and the explicit month and day are used; the week number is ignored.
 * @param weeksISO - forwarded unchanged to getDateForWeekOfYear(); only used when consumeWeek is true.
 * @returns a function mapping the matched text to its normalized form, or to null when the
 *   date components cannot be extracted from the text.
 */
export function getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(consumeWeek: boolean, weeksISO?: boolean) {
    return (s: string): string | null => {
        const extractor = consumeWeek ? DateExtractor_yyyy_Www_Regex : DateExtractor_yyyy_Www_mm_dd_Regex
        const matches = extractor.exec(s)
        if (!matches) {
            // The extractor can be stricter than the pattern which matched the title;
            // report "no value" instead of failing on a null dereference.
            return null
        }
        let yearNumber = Number.parseInt(matches[YEAR_IDX], 10)
        let monthNumber: number
        let dayNumber: number
        let separator = SLASH_SEPARATOR // different values enforce relative > < order of same dates
        if (consumeWeek) {
            const weekNumber = Number.parseInt(matches[WEEK_IDX], 10)
            const orderModifier: string | undefined = matches[RELATIVE_ORDER_IDX]
            let useLastDayOfWeek = false
            if (orderModifier === DateExtractor_orderModifier_earlier_than) {
                separator = EARLIER_THAN_SLASH_SEPARATOR
            } else if (orderModifier === DateExtractor_orderModifier_later_than) {
                // "Later than" is anchored to the last day of the week, not the first
                separator = LATER_THAN_SLASH_SEPARATOR
                useLastDayOfWeek = true
            }
            const dateForWeek = getDateForWeekOfYear(yearNumber, weekNumber, weeksISO, useLastDayOfWeek)
            monthNumber = dateForWeek.getMonth() + 1 // 1 - 12
            dayNumber = dateForWeek.getDate() // 1 - 31
            // Be careful with edge dates, which can belong to previous or next year
            if (weekNumber === 1 && monthNumber === DECEMBER) {
                yearNumber--
            }
            if (weekNumber >= 50 && monthNumber === JANUARY) {
                yearNumber++
            }
        } else { // ignore week
            monthNumber = Number.parseInt(matches[MONTH_IDX], 10)
            dayNumber = Number.parseInt(matches[DAY_IDX], 10)
        }
        return `${prependWithZeros(`${yearNumber}`, YEAR_POSITIONS)}` +
            `-${prependWithZeros(`${monthNumber}`, MONTH_POSITIONS)}` +
            `-${prependWithZeros(`${dayNumber}`, DAY_POSITIONS)}` +
            `${separator}${SLASH_SEPARATOR}`
    }
}

// Ready-made week-based date normalizers:
// - yyyy-Www (mm-dd): the explicit month and day are used, the week number is ignored
// - yyyy-WwwISO: the date is derived from the week number, with weeksISO = true
// - yyyy-Www: the date is derived from the week number, with weeksISO = false
// NOTE(review): the exact week-numbering rules selected by weeksISO live in getDateForWeekOfYear - confirm there.
export const getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(false)
export const getNormalizedDate_yyyy_WwwISO_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(true, true)
export const getNormalizedDate_yyyy_Www_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(true, false)
64 changes: 58 additions & 6 deletions src/custom-sort/sorting-spec-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,19 @@ import {
DASH_SEPARATOR,
Date_dd_Mmm_yyyy_RegexStr,
Date_Mmm_dd_yyyy_RegexStr,
Date_yyyy_dd_mm_RegexStr,
Date_yyyy_mm_dd_RegexStr,
Date_yyyy_Www_mm_dd_RegexStr,
Date_yyyy_Www_RegexStr,
Date_yyyy_WwwISO_RegexStr,
DOT_SEPARATOR,
getNormalizedDate_dd_Mmm_yyyy_NormalizerFn,
getNormalizedDate_Mmm_dd_yyyy_NormalizerFn,
getNormalizedDate_yyyy_dd_mm_NormalizerFn,
getNormalizedDate_yyyy_mm_dd_NormalizerFn,
getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn,
getNormalizedDate_yyyy_Www_NormalizerFn,
getNormalizedDate_yyyy_WwwISO_NormalizerFn,
getNormalizedNumber,
getNormalizedRomanNumber,
NumberRegexStr,
Expand All @@ -36,10 +46,7 @@ import {
MATCH_CHILDREN_2_SUFFIX,
NO_PRIORITY
} from "./folder-matching-rules"
import {
MDataExtractor,
tryParseAsMDataExtractorSpec
} from "./mdata-extractors";
import {MDataExtractor, tryParseAsMDataExtractorSpec} from "./mdata-extractors";

interface ProcessingContext {
folderPath: string
Expand Down Expand Up @@ -352,8 +359,13 @@ const InlineRegexSymbol_Digit1: string = '\\d'
// Symbols as they are written in a sorting specification. The doubled backslash is the
// string-literal escape for a single backslash, so e.g. '\\[0-9]' stands for the text \[0-9]
const InlineRegexSymbol_Digit2: string = '\\[0-9]'
const InlineRegexSymbol_0_to_3: string = '\\[0-3]'

// Date symbols; each one is a key of sortingSymbolToRegexpStr, which pairs it with
// a regex string and a normalizer function
const Date_yyyy_mm_dd_RegexSymbol: string = '\\[yyyy-mm-dd]'
const Date_yyyy_dd_mm_RegexSymbol: string = '\\[yyyy-dd-mm]'
const Date_dd_Mmm_yyyy_RegexSymbol: string = '\\[dd-Mmm-yyyy]'
const Date_Mmm_dd_yyyy_RegexSymbol: string = '\\[Mmm-dd-yyyy]'
const Date_yyyy_Www_mm_dd_RegexSymbol: string = '\\[yyyy-Www (mm-dd)]'
const Date_yyyy_Www_RegexSymbol: string = '\\[yyyy-Www]'
const Date_yyyy_WwwISO_RegexSymbol: string = '\\[yyyy-WwwISO]'

const InlineRegexSymbol_CapitalLetter: string = '\\C'
const InlineRegexSymbol_LowercaseLetter: string = '\\l'
Expand All @@ -373,8 +385,13 @@ const sortingSymbolsArr: Array<string> = [
escapeRegexUnsafeCharacters(CompoundRomanNumberDashRegexSymbol),
escapeRegexUnsafeCharacters(WordInASCIIRegexSymbol),
escapeRegexUnsafeCharacters(WordInAnyLanguageRegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_mm_dd_RegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_dd_mm_RegexSymbol),
escapeRegexUnsafeCharacters(Date_dd_Mmm_yyyy_RegexSymbol),
escapeRegexUnsafeCharacters(Date_Mmm_dd_yyyy_RegexSymbol)
escapeRegexUnsafeCharacters(Date_Mmm_dd_yyyy_RegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_Www_mm_dd_RegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_WwwISO_RegexSymbol),
escapeRegexUnsafeCharacters(Date_yyyy_Www_RegexSymbol),
]

// One regex which is the alternation of all escaped sorting symbols; flags: global and case-insensitive
const sortingSymbolsRegex = new RegExp(sortingSymbolsArr.join('|'), 'gi')
Expand Down Expand Up @@ -442,8 +459,13 @@ export const CompoundDashRomanNumberNormalizerFn: NormalizerFn = (s: string) =>
// Adapters typed as NormalizerFn; each one delegates to the corresponding normalizer
// imported from the matchers module (number normalizers also fix the separator argument)
export const NumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s)
export const CompoundDotNumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s, DOT_SEPARATOR)
export const CompoundDashNumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s, DASH_SEPARATOR)
// Date normalizers, one per supported date symbol
export const Date_yyyy_mm_dd_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_mm_dd_NormalizerFn(s)
export const Date_yyyy_dd_mm_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_dd_mm_NormalizerFn(s)
export const Date_dd_Mmm_yyyy_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_dd_Mmm_yyyy_NormalizerFn(s)
export const Date_Mmm_dd_yyyy_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_Mmm_dd_yyyy_NormalizerFn(s)
export const Date_yyyy_Www_mm_dd_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn(s)
export const Date_yyyy_WwwISO_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_WwwISO_NormalizerFn(s)
export const Date_yyyy_Www_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_Www_NormalizerFn(s)

export enum AdvancedRegexType {
None, // to allow if (advancedRegex)
Expand All @@ -455,8 +477,13 @@ export enum AdvancedRegexType {
CompoundDashRomanNumber,
WordInASCII,
WordInAnyLanguage,
Date_yyyy_mm_dd,
Date_yyyy_dd_mm,
Date_dd_Mmm_yyyy,
Date_Mmm_dd_yyyy
Date_Mmm_dd_yyyy,
Date_yyyy_Www_mm_dd_yyyy,
Date_yyyy_WwwISO,
Date_yyyy_Www
}

const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
Expand Down Expand Up @@ -501,6 +528,16 @@ const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
advancedRegexType: AdvancedRegexType.WordInAnyLanguage,
unicodeRegex: true
},
[Date_yyyy_mm_dd_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_mm_dd_RegexStr,
normalizerFn: Date_yyyy_mm_dd_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_mm_dd
},
[Date_yyyy_dd_mm_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_dd_mm_RegexStr,
normalizerFn: Date_yyyy_dd_mm_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_dd_mm
},
[Date_dd_Mmm_yyyy_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_dd_Mmm_yyyy_RegexStr,
normalizerFn: Date_dd_Mmm_yyyy_NormalizerFn,
Expand All @@ -510,6 +547,21 @@ const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
regexpStr: Date_Mmm_dd_yyyy_RegexStr,
normalizerFn: Date_Mmm_dd_yyyy_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_Mmm_dd_yyyy
},
[Date_yyyy_Www_mm_dd_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_Www_mm_dd_RegexStr,
normalizerFn: Date_yyyy_Www_mm_dd_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_Www_mm_dd_yyyy
},
[Date_yyyy_WwwISO_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_WwwISO_RegexStr,
normalizerFn: Date_yyyy_WwwISO_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_WwwISO
},
[Date_yyyy_Www_RegexSymbol]: { // Intentionally retain character case
regexpStr: Date_yyyy_Www_RegexStr,
normalizerFn: Date_yyyy_Www_NormalizerFn,
advancedRegexType: AdvancedRegexType.Date_yyyy_Www
}
}

Expand Down
Loading

0 comments on commit c200c2e

Please sign in to comment.