version=0.2
# Names of the tokenization rules defined under [META-RULES] and [RULES].
# URL, URL-WWW, URL-DOMAIN and E-MAIL are not defined in this file; [RULES]
# starts with "%include url" and "%include e-mail".
# NOTE(review): presumably the rules are tried in the order listed here - confirm.
[RULE-ORDER]
GENITIVEPLACENAME WORD-WITHSUFFIX WORD-INFIX-COMPOUND QUOTE-SUFFIX
WORD-DIMIN WORD-TOKEN NUMBER-ORDINAL DATE-REVERSE DATE
URL URL-WWW URL-DOMAIN E-MAIL
ABBREVIATION-COMPOUND COMPOUND-ABBREVIATION
QUOTE-COMPOUND WORD-COMPOUND WORD-ILLEGIBLE NUMBER-COMPOUND
# disabled: the UNIT-COMPOUND meta-rule is commented out in [META-RULES]
#UNIT-COMPOUND
INITIAL
ABBREVIATION
ABBREVIATION-KNOWN
CURRENCY
NUMBER-STRING STRING-NUMBER
WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX
INITIALS
# disabled: the UNIT meta-rule is commented out in [META-RULES]
#UNIT
PUNCTUATION-MULTI-DOT
PUNCTUATION-MULTI
NUMBER-YEAR TIME FRACNUMBER NUMBER  WORD PUNCTUATION UNKNOWN
# TODO: PREFIXES, SUFFIXES (both currently empty)
#       UNITS (largely commented out in the C++ code)

# Meta-rules: regex patterns containing placeholders that name one of the
# list sections further down ([ORDINALS], [ABBREVIATIONS], [TOKENS],
# [ATTACHEDSUFFIXES], [CURRENCY]).
# NOTE(review): presumably each placeholder is replaced by an alternation of
# that section's entries before the pattern is compiled - confirm.
[META-RULES]
# Character that delimits a placeholder name inside the patterns below.
SPLITTER=%
# A number (digit groups optionally led by '.' or ','), an optional hyphen and
# a case-insensitive ordinal suffix from [ORDINALS].
# NOTE(review): "\P{Lu}|\P{Ll}" matches any character, since no character is
# both uppercase and lowercase; "[^\p{Lu}\p{Ll}]" may have been intended.
# Confirm before changing.
NUMBER-ORDINAL = (?>(?:[\.,]?\p{N}+)+)-?(?i)(?: %ORDINALS% )(?:\Z|\P{Lu}|\P{Ll})$

# A known abbreviation from [ABBREVIATIONS] with an optional trailing period;
# only the abbreviation itself (plus that period) is captured.
# The \p{S} inside the negated class [^\p{L}\p{S}\.] prevents splitting
# <tag>-like strings.
ABBREVIATION-KNOWN = (?:\p{P}*)?(?:\A|[^\p{L}\p{S}\.])((?:%ABBREVIATIONS%)(?:\.{0,1}))(?:\Z|\P{L})
# An entry from [TOKENS] (case-insensitive, captured), optionally followed by
# punctuation up to the end of the input.
WORD-TOKEN =(?i)(%TOKENS%)(?:\p{P}*)?$
# Letters/digits/hyphens ending in an entry from [ATTACHEDSUFFIXES]; the whole
# word including the suffix is captured.
WORD-WITHSUFFIX = ((?:\p{L}|\p{N}|-)+(?i)(?: %ATTACHEDSUFFIXES% ))(?:\Z)
# As WORD-WITHSUFFIX, but the attached suffix is followed by a hyphen and more
# letters, so the suffix sits in the middle of a compound.
WORD-INFIX-COMPOUND = ((?:\p{L}|\p{N}|-)+(?i)(?: %ATTACHEDSUFFIXES% )-(?:\p{L}+))$

# A currency symbol (\p{Sc}) or an entry from [CURRENCY] at the start of the
# input, followed by a digit or the end of the input; only the currency part
# is captured.
CURRENCY=^(\p{Sc}|%CURRENCY%)(?:\p{N}|\Z)

# Disabled: the [PREFIXES] and [SUFFIXES] sections are empty.
#PREFIX = (?:\A|[^\p{Lu}\.]|[^\p{Ll}\.])(%PREFIXES% )(\p{L}+)
#SUFFIX = ((?:\p{L})+)( %SUFFIXES% )(?:\Z|\P{L})

# Disabled unit rules (their names are commented out in [RULE-ORDER] as well).
# NOTE(review): if UNIT is ever re-enabled, "\a" looks like a typo for "\A"
# (the commented-out PREFIX rule uses "\A") - confirm.
#UNIT-COMPOUND = \p{N}+((?i: %UNITS% )(?:[./*=]{1})(?i: %UNITS% )(?:\p{P}{0,1}))$
#UNIT = (?i)(?:\a|\P{L})( %UNITS% )(?:\z|\P{L})

# Plain regex rules, one NAME=pattern per line, without list placeholders.
[RULES]
# NOTE(review): presumably these pull in the URL, URL-WWW, URL-DOMAIN and
# E-MAIL rules named in [RULE-ORDER], which are not defined in this file - confirm.
%include url
%include e-mail

# Word with both a bracketed prefix and a bracketed suffix; \p{Ps}/\p{Pe} accept
# any opening/closing punctuation, and a dash or connector may join the parts.
#Ex (oud)-studente(s)
WORD-PARPREFIX-PARSUFFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Word with a bracketed prefix only.
#Ex: (oud)-studente, (on)zin,
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*

# Word with a bracketed suffix only.
#Ex: koning(in)
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Letters/digits, an apostrophe-like character, then "tje" or "tjes".
#Ex: baby'tje
WORD-DIMIN=[\p{L}\p{N}]+['`’‘´]tjes{0,1}

#Keep dash/underscore/quote connected parts together (even if they are in parentheses)
WORD-COMPOUND=[\p{L}\p{N}]+(?:[\p{Pc}\p{Pd}'`’‘´][\p{L}\p{N}]+)+

#Keep [...] constructions inside words, indicating illegibility of a part or the whole: rege[..]ng
# Exactly one bracketed run of dots; the letters/digits on either side are optional.
WORD-ILLEGIBLE=[\p{L}\p{N}]*(?:\[\.+\])[\p{L}\p{N}]*

#Keep quote connected parts together: letters/digits/hyphens, one or more
# (quote + single letter) groups, then one or more hyphen-joined parts.
QUOTE-COMPOUND=[\p{L}\p{N}-]+(?:['`’‘´]\p{L}{1})+(?:-[\p{L}\p{N}]+)+

#Abbreviations with multiple periods: a.u.b
# Anchored at the start; an optional opening bracket ({, (, [ or <) may precede
# it. Only the abbreviation (parts of 1-3 letters, optional final period) is
# captured, and it must be followed by the end of the input or one of , : ; } ) ] >
ABBREVIATION=^(?:[\{\(\[\<]?)(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;\}\)\]\>])
# a compound of a word and an abbreviation with multiple periods: bouw-c.a.o.
# Must span the whole input (^ ... \Z).
COMPOUND-ABBREVIATION=^(?:\p{L}+-\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z

# a compound of an abbreviation with multiple periods and a word: C.B.S.-data
# Anchored at the start only; the whole compound is captured.
ABBREVIATION-COMPOUND=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?-\p{L}+)

#Initials glued to a longer word: A.F.Zetterij -> A.F. Zetterij
# Only the initials are captured; the following word must start with an
# uppercase letter and have at least three more letters ("{3,999}+" is a
# possessive quantifier).
INITIALS=(\p{L}(?:\.\p{L})+\.)\p{Lu}\p{L}{3,999}+

#retain initials: a single uppercase or titlecase letter followed by a period,
# spanning the whole input
INITIAL=^(?:\p{Lt}|\p{Lu})\.$

#Repeated punctuation (ellipsis etc)
# PUNCTUATION-MULTI-DOT: two or more characters from - _ * = ! < > ? (the
# alternation allows a mixed run) followed by a period; only the period is captured.
PUNCTUATION-MULTI-DOT=(?:\-|_|\*|=|!|<|>|\?){2,}(\.)
# PUNCTUATION-MULTI: two or more characters from the same set, or two or more periods.
PUNCTUATION-MULTI=(?:(?:\-|_|\*|=|!|<|>|\?){2,})|(?:\.){2,}

#Date
# DATE: 1-2 digits, 1-2 digits and 2-4 digits separated by '-' or '.', with an
# optional apostrophe-like character before the last group.
DATE=\p{N}{1,2}[-\.]\p{N}{1,2}[-\.]['’‘´\`]?\p{N}{2,4}
# DATE-REVERSE: 4 digits first, then two groups of 1-2 digits.
DATE-REVERSE=\p{N}{4}[-\.]\p{N}{1,2}[-\.]\p{N}{1,2}

# Digit groups separated by slashes: 1/2, 3/4/5
FRACNUMBER=\p{N}+(?:/\p{N}+)+

#tel numberstring (23)3748748 or (+31)9393902
# A parenthesised group of 1-3 digits with optional leading '+', followed by digits.
NUMBER-COMPOUND=\(\+{0,1}\p{N}{1,3}\)(?:\p{N}+)

#numberstring like 20jarige
# Anchored at the start: digits, an optional dash, then letters.
NUMBER-STRING=^\p{N}+(?:\p{Pd}?)(?:\p{L}+)

#combinations like A50, a1
# Letters, an optional dash, then digits.
STRING-NUMBER=\p{L}+\p{Pd}?\p{N}+

#abbreviated year: '40
# The apostrophe-like character plus two digits are captured; they must be
# followed by a non-digit or the end of the input.
NUMBER-YEAR=(['`’‘´]\p{N}{2})(?:\P{N}|\z)

#Times: 9:30 or 12:05:33, optionally followed directly by am/pm
# The am/pm suffix is deliberately a non-capturing group. Other rules in this
# file (INITIALS, NUMBER-YEAR, PUNCTUATION-MULTI-DOT) use a capture group to
# mark which part of the match becomes the token, so capturing the suffix
# would single out only "pm" from "10:30pm" instead of the whole time.
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(?:am|pm|AM|PM)?

#retain digits, including those starting with an initial period (.22), negative numbers, and non-spacing marks
# Inside a [...] character class '+' and '|' are literal characters, not a
# quantifier or an alternation, so the class lists only \p{N} and \p{Mn}.
# Otherwise a bare "+" or "|" would be accepted as a NUMBER.
NUMBER=-?(?:[\.,]?[\p{N}\p{Mn}])+

#'s-Gravenhage, 's-Hertogenbosch
# An apostrophe-like character, s or S, a hyphen, then a capitalised word.
GENITIVEPLACENAME=['`’‘´][sS]-\p{Lu}\p{L}+

# A run of letters, non-spacing marks and hyphens.
WORD=[\p{L}\p{Mn}\-]+

# A single punctuation character.
PUNCTUATION=\p{P}

# Any single character; listed last in [RULE-ORDER].
UNKNOWN=.

# A capitalised word ending in s or x followed by an apostrophe-like
# character; the whole word including the quote is captured.
# Defined here at the end, but listed on the first line of [RULE-ORDER].
QUOTE-SUFFIX=(\p{Lu}(?:\p{L})+(?:s['`’‘´]|x['`’‘´]))

# List sections: one entry per line, referenced from [META-RULES] through
# placeholders of the same name.
# PREFIXES and SUFFIXES are empty; the rules that use them are commented out.
[PREFIXES]

[SUFFIXES]

# Used by WORD-WITHSUFFIX and WORD-INFIX-COMPOUND: an apostrophe-like
# character followed by s, t, n or r.
# NOTE(review): the leading backslash presumably stops the parser from reading
# a line that starts with '[' as a section header - confirm.
[ATTACHEDSUFFIXES]
\['`’‘´][s]
\['`’‘´][t]
\['`’‘´][n]
\['`’‘´][r]

# Ordinal suffixes used by NUMBER-ORDINAL (matched case-insensitively there).
[ORDINALS]
e
de
ste
er


# Used by WORD-TOKEN: an apostrophe-like character followed by s, k, m, n, t
# or "ie" (matched case-insensitively there).
[TOKENS]
\['`’‘´][s]
\['`’‘´][k]
\['`’‘´][m]
\['`’‘´][n]
\['`’‘´][t]
\['`’‘´](?:ie)

# Currently unused: the UNIT and UNIT-COMPOUND rules are commented out.
[UNITS]
km
cm
mmol
mm
mol
m
g
kg
C
l
s
sec
min
gb
mb
kb


# Used by CURRENCY in addition to \p{Sc}. That rule has no (?i) flag, which
# would explain why both spellings of "eur" are listed.
[CURRENCY]
eur|EUR
hfl
fl
f


# Used by ABBREVIATION-KNOWN.
# NOTE(review): presumably nld_afk is an external list of Dutch abbreviations - confirm.
[ABBREVIATIONS]
%include nld_afk
&c

# NOTE(review): presumably characters listed here are removed or replaced
# before tokenization - confirm against the tokenizer documentation.
[FILTER]
%include ligatures
# also filter soft hyphen
\u00AD

# End-of-sentence markers, taken entirely from the included file.
[EOSMARKERS]
%include standard-eos

# Quote characters, taken entirely from the two included files.
[QUOTES]
%include standard-quotes
%include exotic-quotes
