version=0.2
[RULE-ORDER]
# Whitespace-separated rule names. Presumably the tokenizer tries the rules in
# this order (the catch-all UNKNOWN is listed last) - TODO confirm.
# URL, URL-WWW, URL-DOMAIN, E-MAIL, SMILEY and REVERSE-SMILEY are not defined in
# this file; presumably they come from the external rule files named at the
# top of the RULES section.
PREFIX SUFFIX WORD-TOKEN URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
NUMBER-YEAR FRACNUMBER TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN

[META-RULES]
# Rules whose patterns contain placeholders. A placeholder is a section name
# (TOKENS, PREFIXES, SUFFIXES) wrapped in the SPLITTER character; presumably the
# entries of that section are substituted as alternatives - TODO confirm.
SPLITTER=%
# Disabled rule: commented out, and also absent from the RULE-ORDER list.
#ABBREVIATION-KNOWN = (?:\p{P}*)?(?:\A|[^\p{L}\.])(?:\A)((?:%ABBREVIATIONS%)(?:\.{0,1}))(?:\Z|\P{L})
# A TOKENS entry, optionally followed by punctuation, at the end of the input.
WORD-TOKEN =(%TOKENS%)(?:\p{P}*)?$
# A PREFIXES entry that is directly followed by one or more letters.
PREFIX = (?:\A|[^\p{Lu}\.]|[^\p{Ll}\.])(%PREFIXES% )\p{L}+
# A SUFFIXES entry preceded by letters and followed by a non-letter or the end.
SUFFIX = (?:\A|\p{L})+( %SUFFIXES% )(?:\Z|\P{L})

[RULES]
# The next three lines pull rule definitions in from external files. Judging
# by the names, they supply the URL, E-MAIL and SMILEY rules listed in the
# rule order; their contents are not visible in this file.
%include url
%include e-mail
%include smiley

# Word with a bracketed prefix, e.g. (dis)information: an opening bracket,
# letters, a closing bracket (each bracket side may carry a dash/connector),
# then a word that may itself consist of dash/connector-joined parts.
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*

# Word with a bracketed suffix, e.g. understand(s): a word (possibly made of
# dash/connector-joined parts) followed by letters enclosed in brackets.
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Keep parts connected by a dash or underscore together (even if they are in
# parentheses): two or more letter runs joined by dash/connector punctuation.
WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+

# Abbreviations with multiple periods: groups of one to three letters separated
# by periods, with an optional final period (e.g. p.ex. or S.A.). Anchored at
# the start, optionally after an opening bracket; must be followed by the end
# of the input or by one of , : ; or a closing bracket.
ABBREVIATION=^(?:[\{\(\[\<]?)(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;\}\)\]\>])

# Retain initials: the whole input is one uppercase or titlecase letter
# followed by a period (e.g. J.).
INITIAL=^(?:\p{Lt}|\p{Lu})\.$

# Runs of punctuation (ellipsis etc.): two or more characters drawn from
# . - ! ? in a row. The characters in a run may differ, so ?! and !!! both match.
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}

# Dates written with / or - as separator.
# DATE: 1-2 digits, 1-2 digits, then 2-4 digits (e.g. 31/12/2020, 1-2-99).
# DATE-REVERSE: 4 digits first, then 1-2 digits twice (e.g. 2020-12-31).
DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}

# Fractions: two or more digit runs separated by slashes (e.g. 1/2, 3/4/5).
FRACNUMBER=\p{N}+(?:/\p{N}+)+

# Abbreviated year: an apostrophe-like character plus exactly two digits
# (e.g. '98), followed by a non-digit or the end of the input.
NUMBER-YEAR=(['`’‘´]\p{N}{2})(?:\P{N}|\z)

# Times: one or two digits, a colon, one or two digits, optionally followed by
# a seconds part and a case-insensitive am/pm suffix (e.g. 9:30, 12:30:45,
# 11:15pm). The seconds part accepts one or two digits so that 12:30:45 is
# matched as a whole rather than being cut off after the first seconds digit.
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(?i:am|pm)?

# Retain numbers as one token: an optional leading hyphen-minus (negative
# numbers) followed by one or more digit runs, each optionally preceded by a
# period or comma. Covers forms such as .22, 1,000.50 and -5.
NUMBER=-?(?:[\.,]?\p{N}+)+

# A single currency symbol (Unicode category Sc).
CURRENCY=\p{Sc}

# A run of letters and nonspacing marks (Unicode categories L and Mn).
WORD=[\p{L}\p{Mn}]+

# A single punctuation character.
PUNCTUATION=\p{P}

# Catch-all: any single character. Listed last in the rule order.
UNKNOWN=.

[PREFIXES]
# Elided French clitics (qu', l', d', m', n', t', s', c', j'), each followed by
# one of several apostrophe-like characters. Referenced by the PREFIX meta-rule.
# NOTE(review): the leading backslash presumably keeps an entry that starts
# with a bracket from being read as a section header - confirm against the parser.
(?:qu|Qu|QU)['`’‘´]
\[lL]['`’‘´]
\[dD]['`’‘´]
\[mM]['`’‘´]
\[nN]['`’‘´]
\[tT]['`’‘´]
\[sS]['`’‘´]
\[cC]['`’‘´]
\[jJ]['`’‘´]

[SUFFIXES]
# Hyphen-attached French clitics, each given in all-lowercase and all-uppercase
# form, plus the linking -t (either case). Referenced by the SUFFIX meta-rule.
-je|-JE
-me|-ME
-tu|-TU
-te|-TE
-[tT]
-il|-IL
-se|-SE
-lui|-LUI
-elle|-ELLE
-nous|-NOUS
-vous|-VOUS
-ils|-ILS
-eux|-EUX
-elles|-ELLES
-ce|-CE
-ci|-CI
-là|-LÀ
-la|-LA
-le|-LE
-les|-LES

[TOKENS]
# Strings referenced by the WORD-TOKEN meta-rule: the elided clitics with an
# apostrophe-like character, plus aujourd'hui (which contains an apostrophe).
# NOTE(review): the leading backslash presumably keeps an entry that starts
# with a bracket from being read as a section header - confirm against the parser.
\[lL]['`’‘´]
\[dD]['`’‘´]
\[mM]['`’‘´]
\[nN]['`’‘´]
\[tT]['`’‘´]
\[sS]['`’‘´]
\[cC]['`’‘´]
\[jJ]['`’‘´]
\[aA]ujourd['`’‘´]hui
AUJOURD['`’‘´]HUI
(?:qu|Qu|QU)['`’‘´]

[UNITS]
# Unit abbreviations, one per line. No rule visible in this file references
# this section; how the tokenizer uses it is not shown here.
km
m
cm
mm
g
kg
C
l
s
sec
min
gb
mb
kb


[CURRENCY]
# Currency codes, one per line. The CURRENCY rule in the RULES section matches
# currency symbols only; how this list is used is not visible in this file.
EUR
CAD

[ABBREVIATIONS]
# Entries are read from the external file named below (not visible here).
%include fra

[FILTER]
# Each line gives a character and, after a space, its replacement: the
# typographic ligatures below are expanded to plain letters.
ﬂ fl
ﬀ ff
ﬃ ffi
ﬄ ffl
# Also filter the soft hyphen (U+00AD); no replacement is listed for it.
\u00AD


[EOSMARKERS]
# Entries are read from the external file named below; presumably the
# end-of-sentence markers, judging by the section name - TODO confirm.
%include standard-eos

[QUOTES]
# Entries are read from the external file named below; presumably the
# recognised quote characters, judging by the section name - TODO confirm.
%include standard-quotes
