author Adrien Di Mascio <>
Tue, 28 Nov 2017 18:15:10 +0100
changeset 518 18c42a345591
parent 375 343a4304a259
permissions -rw-r--r--
[aligner] safety belt to avoid crash on empty sets There are several parts in the code that assume that at least refset is not empty (e.g. log_infos() to compute the alignment progress)

# -*- coding: utf-8 -*-
"""Stopwords in different languages."""

# French stopwords, all lower-case, grouped by initial letter.
#
# Every entry keeps its accents ('début', 'où', 'très', 'être', ...) except
# 'meme'.  The correctly accented 'même' is therefore listed as well so that
# accented text is matched; the unaccented 'meme' is kept so that anything
# already relying on it keeps working.
FRENCH_STOPWORDS = {
    'alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir',
    'bon',
    'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment',
    'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait',
    'doit', 'donc', 'dos', 'droite', 'du', 'début',
    'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux',
    'fait', 'faites', 'fois', 'font', 'force',
    'haut', 'hors',
    'ici', 'il', 'ils',
    'je', 'juste',
    'la', 'le', 'les', 'leur', 'lui', 'là',
    'ma', 'maintenant', 'mais', 'me', 'meme', 'même', 'mes', 'mine', 'moi',
    'moins', 'mon', 'mot',
    'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux',
    'on', 'ou', 'où',
    'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce',
    'plupart', 'pour', 'pourquoi',
    'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui',
    'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont',
    'sous', 'soyez', 'sujet', 'sur',
    'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous',
    'tout', 'trop', 'très', 'tu',
    'un', 'une',
    'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu',
    'ça',
    'étaient', 'état', 'étions', 'été', 'être',
}

# English stopwords, all lower-case, grouped by initial letter.
#
# Notes for maintainers:
# * "it's" used to be spelled "it's'" (stray trailing apostrophe), so the
#   real contraction was never matched; the entry is now spelled correctly.
# * 'fify' looks like a misspelling of 'fifty' and 'amoungst' of 'amongst'.
#   Both odd spellings are kept so existing filtering does not change, and
#   the missing 'fifty' is listed next to 'fify'.
# * Entries that were listed twice ("i'd", "i'll", "i'm", "i've", "that's")
#   now appear once; the resulting set is the same.
ENGLISH_STOPWORDS = {
    'a', 'able', 'about', 'above', 'according', 'accordingly', 'across',
    'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all',
    'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also',
    'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount',
    'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything',
    'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate',
    'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking',
    'associated', 'at', 'available', 'away', 'awfully',
    'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming',
    'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below',
    'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill',
    'both', 'bottom', 'brief', 'but', 'by',
    'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes',
    'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come',
    'comes', 'computer', 'con', 'concerning', 'consequently', 'consider',
    'considering', 'contain', 'containing', 'contains', 'corresponding',
    'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon",
    "c's",
    'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did',
    "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't",
    'down', 'downwards', 'due', 'during',
    'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere',
    'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever',
    'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex',
    'exactly', 'example', 'except',
    'far', 'few', 'fifteen', 'fifth', 'fifty', 'fify', 'fill', 'find',
    'fire', 'first', 'five', 'followed', 'following', 'follows', 'for',
    'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front',
    'full', 'further', 'furthermore',
    'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes',
    'going', 'gone', 'got', 'gotten', 'greetings',
    'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have',
    "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here',
    'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself',
    "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how',
    'howbeit', 'however', 'hundred',
    'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate',
    'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates',
    'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is',
    "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's",
    'just',
    'keep', 'keeps', 'kept', 'know', 'known', 'knows',
    'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest',
    'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking',
    'looks', 'ltd',
    'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile',
    'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly',
    'move', 'much', 'must', 'my', 'myself',
    'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs',
    'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no',
    'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing',
    'novel', 'now', 'nowhere',
    'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on',
    'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others',
    'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside',
    'over', 'overall', 'own',
    'part', 'particular', 'particularly', 'per', 'perhaps', 'placed',
    'please', 'plus', 'possible', 'presumably', 'probably', 'provides',
    'put',
    'que', 'quite', 'qv',
    'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless',
    'regards', 'relatively', 'respectively', 'right',
    'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly',
    'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self',
    'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several',
    'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since',
    'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow',
    'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere',
    'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub',
    'such', 'sup', 'sure', 'system',
    'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks',
    'thanx', 'that', "that's", 'thats', 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
    'therefore', 'therein', 'theres', 'thereupon', "there's", 'these',
    'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin',
    'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though',
    'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together',
    'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly',
    'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's",
    'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto',
    'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually',
    'value', 'various', 'very', 'via', 'viz', 'vs',
    'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went',
    'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever',
    "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas',
    'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether',
    'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose',
    "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without',
    'wonder', "won't", 'would', "wouldn't",
    'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd",
    "you'll", "you're", "you've",
    'zero',
}

# Base forms of English regular verbs, all lower-case, grouped by initial
# letter.  Spellings are British where they differ ('analyse', 'colour',
# 'realise', ...).  Note that 'mess up' is a two-word entry.
ENGLISH_REGULAR_VERBS = {
    # a
    'accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert',
    'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise',
    'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange',
    'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend',
    'attract', 'avoid',
    # b
    'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe',
    'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind',
    'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book',
    'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe',
    'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz',
    # c
    'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause',
    'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer',
    'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip',
    'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command',
    'communicate', 'compare', 'compete', 'complain', 'complete',
    'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider',
    'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count',
    'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure',
    'curl', 'curve', 'cycle',
    # d
    'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide',
    'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe',
    'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree',
    'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide',
    'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop',
    'drown', 'drum', 'dry', 'dust',
    # e
    'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end',
    'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse',
    'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend',
    # f
    'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence',
    'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash',
    'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force',
    'form', 'found', 'frame', 'frighten', 'fry',
    # g
    'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet',
    'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide',
    # h
    'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate',
    'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope',
    'hover', 'hug', 'hum', 'hunt', 'hurry',
    # i
    'identify', 'ignore', 'imagine', 'impress', 'improve', 'include',
    'increase', 'influence', 'inform', 'inject', 'injure', 'instruct',
    'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent',
    'invite', 'irritate', 'itch',
    # j
    'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump',
    # k
    'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot',
    # l
    'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license',
    'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load',
    'lock', 'long', 'look', 'love',
    # m
    'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter',
    'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk',
    'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug',
    'multiply', 'murder',
    # n
    'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number',
    # o
    'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer',
    'open', 'order', 'overflow', 'owe', 'own',
    # p
    'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat',
    'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone',
    'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please',
    'plug', 'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour',
    'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present',
    'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce',
    'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch',
    'puncture', 'punish', 'push',
    # q
    'question', 'queue',
    # r
    'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive',
    'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign',
    'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember',
    'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report',
    'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse',
    'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush',
    # s
    'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold',
    'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub',
    'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share',
    'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign',
    'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash',
    'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow',
    'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill',
    'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal',
    'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step',
    'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch',
    'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer',
    'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise',
    'surround', 'suspect', 'suspend', 'switch',
    # t
    'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify',
    'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire',
    'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap',
    'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble',
    'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type',
    # u
    'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use',
    # v
    'vanish', 'visit',
    # w
    'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash',
    'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip',
    'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble',
    'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle',
    # x, y, z
    'x-ray',
    'yawn', 'yell',
    'zip', 'zoom',
}

# Forms of English irregular verbs (base form, past tense and past
# participle mixed together), all lower-case, grouped by initial letter.
#
# Every entry is a single clean token:
# * 'arise', 'arose' and 'stand' used to carry a trailing space, so a
#   membership test with the bare word failed;
# * 'was' and 'were' used to be fused into the single string 'was, were',
#   which matched neither word.
ENGLISH_IRREGULAR_VERBS = {
    # a
    'arise', 'arisen', 'arose', 'ate', 'awake', 'awakened', 'awoke',
    'awoken',
    # b
    'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat',
    'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend',
    'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite',
    'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born',
    'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring',
    'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build',
    'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy',
    # c
    'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad',
    'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept',
    'cut',
    # d
    'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig',
    'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done',
    'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew',
    'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled',
    'dwelt',
    # e
    'eat', 'eaten',
    # f
    'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find',
    'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung',
    'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego',
    'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold',
    'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot',
    'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found',
    'freeze', 'froze', 'frozen',
    # g
    'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew',
    'grind', 'ground', 'grow', 'grown',
    # h
    'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn',
    'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt',
    # k
    'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted',
    'know', 'known',
    # l
    'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap',
    'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left',
    'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose',
    'lost',
    # m
    'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand',
    'misunderstood', 'mow', 'mowed', 'mown',
    # p
    'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded',
    'pled', 'proofread', 'prove', 'proved', 'proven', 'put',
    # q
    'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted',
    # r
    'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen',
    'rode', 'rose', 'run', 'rung',
    # s
    'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see',
    'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn',
    'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared',
    'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot',
    'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing',
    'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew',
    'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk',
    'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold',
    'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed',
    'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill',
    'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled',
    'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung',
    'spun', 'stand', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole',
    'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden',
    'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode',
    'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet',
    'sunburn', 'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear',
    'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim',
    'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung',
    # t
    'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell',
    'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown',
    'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust',
    'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden',
    # u
    'understand', 'understood', 'undertake', 'undertaken', 'undertook',
    'undid', 'undo', 'undone',
    # w
    'wake', 'waked', 'was', 'waylaid', 'waylay', 'wear', 'weave', 'weaved',
    'wed', 'wedded', 'weep', 'went', 'wept', 'were', 'wet', 'wetted', 'whet',
    'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew',
    'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won',
    'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written',
    'wrote', 'wrung',
}