import re def _get_link_pattern(): # regexp meta characters are: . ^ $ * + ? { } [ ] \ | ( ) # one escapes the metachars with \ # \S matches anything but ' ' '\t' '\n' '\r' '\f' and '\v' # \s matches any whitespace character # \w any alphanumeric character # \W any non-alphanumeric character # \b means word boundary. This is a zero-width assertion that # matches only at the beginning or end of a word. # ^ matches at the beginning of lines # # * means 0 or more times # + means 1 or more times # ? means 0 or 1 time # | means or # [^*] anything but '*' (inside [] you don't have to escape metachars) # [^\s*] anything but whitespaces and '*' # (? in the matching string don't match ? or ) etc.. if at # the end # so http://be) will match http://be and http://be)be) will match # http://be)be legacy_prefixes = r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"\ r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"\ r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"\ r"|%[A-Fa-f0-9]{2})+"\ r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)" # NOTE: it's ok to catch www.gr such stuff exist! # FIXME: recognize xmpp: and treat it specially links = r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"\ r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"\ r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)" # 2nd one: at_least_one_char@at_least_one_char.at_least_one_char mail = r'\bmailto:\S*[^\s\W]|' r'\b\S+@\S+\.\S*[^\s\W]' link_pattern = links + '|' + mail + '|' + legacy_prefixes return link_pattern def _get_basic_pattern(): basic_pattern = _get_link_pattern() # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*) # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold* formatting = r'|(?