# GEX thesis source code, full text, references
# chrisper.py (26KB)
  1. #!/usr/bin/env python2
  2. import subprocess
  3. import sys
  4. import re
  5. import inspect
  6. import datetime
  7. import string
  8. from collections import defaultdict, Counter
  9. import nltk
  10. try:
  11. nltk.pos_tag('Just trying to see if the NLTK dataset is installed')
  12. except LookupError:
  13. nltk.download('maxent_treebank_pos_tagger')
  14. try:
  15. nltk.word_tokenize('test')
  16. except LookupError:
  17. nltk.download('punkt')
  18. try:
  19. from clint.textui import colored
  20. except:
  21. class Passthrough(object):
  22. def __getattr__(self, name):
  23. return lambda x: x
  24. colored = Passthrough()
  25. print "=== For colored output, install clint (via 'sudo pip install clint') ==="
  26. PREPOSITIONS = ["a", "abaft", "aboard", "about", "above", "absent",
  27. "across", "afore", "after", "against", "along", "alongside", "amid",
  28. "amidst", "among", "amongst", "an", "apropos", "around", "as", "aside",
  29. "astride", "at", "athwart", "atop", "barring", "before", "behind", "below",
  30. "beneath", "beside", "besides", "between", "betwixt", "beyond", "but", "by",
  31. "circa", "concerning", "despite", "down", "during", "except", "excluding",
  32. "failing", "following", "for", "from", "given", "in", "including", "inside",
  33. "into", "lest", "like", "mid", "midst", "minus", "modulo", "near", "next",
  34. "notwithstanding", "of", "off", "on", "onto", "opposite", "out", "outside",
  35. "over", "pace", "past", "per", "plus", "pro", "qua", "regarding", "round",
  36. "sans", "save", "since", "than", "through,", "throughout,", "till", "times",
  37. "to", "toward", "towards", "under", "underneath", "unlike", "until", "unto",
  38. "up", "upon", "versus", "via", "vice", "with", "within", "without",
  39. "worth", "through"]
  40. # Obtained with (avoiding the dependency):
  41. # from nltk.corpus import stopwords
  42. # stopwords.words("english")
  43. STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
  44. 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
  45. 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they',
  46. 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
  47. 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
  48. 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
  49. 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
  50. 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
  51. 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
  52. 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
  53. 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
  54. 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
  55. 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
  56. 'just', 'don', 'should', 'now']
  57. CONJUNCTIONS = ["and", "but", "or", "yet", "for", "nor", "so"]
  58. class Paper(object):
  59. ##########################################################################
  60. # INTERNAL STUFF (you can ignore these functions)
  61. ##########################################################################
  62. def __init__(self, filenames):
  63. self.__filenames = filenames
  64. self.errors = 0
  65. self.__clear_caches()
  66. def __clear_caches(self):
  67. self.__text = {}
  68. self.__latex_text = None
  69. @staticmethod
  70. def __flatten_paragraphs(text):
  71. '''Given a text where paragraphs are separated by one or more
  72. empty lines, it puts every paragraph in a single, separate line.
  73. Example:
  74. I like sushi
  75. and pie.
  76. I ride
  77. horses.
  78. Becomes:
  79. I like sushi and pie.
  80. I ride horses '''
  81. return '\n'.join(paragraph.replace('\n', ' ')
  82. for paragraph in re.split("\n(\s*\n)+", text)
  83. if paragraph.strip())
  84. def __run_all_with_prefix(self, prefix):
  85. # Switch filename and clear caches
  86. for filename in self.__filenames:
  87. self.filename = filename
  88. self.__clear_caches()
  89. # Call all functions
  90. for name in filter(lambda n: n.startswith(prefix), dir(self)):
  91. attribute = getattr(self, name)
  92. if inspect.ismethod(attribute):
  93. attribute()
  94. def _run_all_checks(self):
  95. self.__run_all_with_prefix('check_')
  96. def _run_all_tests(self):
  97. self.__filenames = ["TEST"]
  98. self.__run_all_with_prefix('test_')
  99. def _format_re_match(self, m, text):
  100. start_of_sentence = max(text.rfind('\n', 0, m.start()) + 1, 0)
  101. end_of_sentence = text.find('\n', m.end()), len(text)
  102. if end_of_sentence == -1:
  103. end_of_sentence = len(text)
  104. a_string_start = max(start_of_sentence, m.start() - 10)
  105. a_string_end = min(end_of_sentence, m.end() + 10)
  106. a_string = text[a_string_start : m.start()]
  107. a_string += colored.yellow(text[m.start() : m.end()])
  108. a_string += text[m.end() : a_string_end]
  109. to_return = a_string.split('\n', 1)[0]
  110. return to_return.replace('\r',' ').replace('\n',' ')
  111. ##########################################################################
  112. # FUCTIONS THAT ARE RELEVANT FOR CHECKS WRITERS
  113. ##########################################################################
  114. def get_latex_text(self):
  115. ''' Returns the complete paper, with each paragraph on a single
  116. line. No latex/tex command is stripped '''
  117. if self.__latex_text != None:
  118. return self.__latex_text
  119. else:
  120. with open(self.filename) as f:
  121. text = f.read()
  122. self.__latex_text = self.__flatten_paragraphs(text)
  123. return self.__latex_text
  124. _IGNORED_ENVIRONMENTS = ("array",
  125. "eqnarray",
  126. "equation",
  127. "figure",
  128. "mathmatica",
  129. "picture",
  130. "table",
  131. "verbatim",
  132. "lstlisting")
  133. def get_text(self, ignored_environments=None):
  134. ''' Returns the textual content of the tex files, with latex/tex
  135. enviroments stripped. You can control the enviroments to strip via
  136. the 'ignored_environments' argument: if you don't, the default ones
  137. will be stripped'''
  138. if ignored_environments == None:
  139. ignored_environments = Paper._IGNORED_ENVIRONMENTS
  140. try:
  141. return self.__text[ignored_environments]
  142. except:
  143. # Cleanup annoying things
  144. text = self.get_latex_text()
  145. text = re.sub(r'\\cite{[^}]*}', '', text)
  146. text = re.sub(r'\\-', '', text)
  147. # Run it through detex
  148. p = subprocess.Popen(["detex",
  149. "-l",
  150. "-n",
  151. "-e",
  152. ','.join(ignored_environments)],
  153. stdin=subprocess.PIPE,
  154. stdout=subprocess.PIPE)
  155. p.stdin.write(text)
  156. text = p.communicate()[0]
  157. p.stdin.close()
  158. self.__text[ignored_environments] = self.__flatten_paragraphs(
  159. text)
  160. return self.__text[ignored_environments]
  161. def perform_test(self, function, expected_errors_num):
  162. self.errors = 0
  163. function()
  164. assert expected_errors_num == self.errors
  165. def print_issue(self, message, match=None, text=None):
  166. if text == None:
  167. text = self.get_text()
  168. message = colored.red(message)
  169. if match and text:
  170. example = self._format_re_match(match, text)
  171. else:
  172. example = ''
  173. print "%30s - %s: %s" % (colored.green(self.filename),
  174. colored.red(message),
  175. example)
  176. ##########################################################################
  177. # CHECKS
  178. ##########################################################################
  179. def check_exempli_gratia_without_comma(self):
  180. for m in re.finditer(r'e\.g\.[^,]',
  181. self.get_text(),
  182. re.MULTILINE):
  183. self.print_issue("E.G. without comma", m)
  184. self.errors += 1
  185. def test__check_exempli_gratia_without_comma(self):
  186. self.get_text = lambda: "e.g. a pony \n e.g. what?, e.g., cool!"
  187. self.perform_test(self.check_exempli_gratia_without_comma, 2)
  188. ##########################################################################
  189. def check_id_est_without_comma(self):
  190. for m in re.finditer(r'i\.e\.[^,]',
  191. self.get_text(),
  192. re.MULTILINE):
  193. self.print_issue("I.E. without comma", m)
  194. self.errors += 1
  195. def test__check_id_est_without_comma(self):
  196. self.get_text = lambda: "i.e. a pony \n i.e. what?, i.e., cool!"
  197. self.perform_test(self.check_id_est_without_comma, 2)
  198. ##########################################################################
  199. def check_quotes(self):
  200. for m in re.finditer(r'"',
  201. self.get_text(),
  202. re.MULTILINE):
  203. self.print_issue('"hello" should be ``hello\'\'', m)
  204. self.errors += 1
  205. def test__check_quotes(self):
  206. self.get_text = lambda: '"this is not ok" ``but this is\'\''
  207. self.perform_test(self.check_quotes, 2)
  208. ##########################################################################
  209. def check_citing_with_unbreakeable_spaces(self):
  210. for keyword in ["cite", "ref"]:
  211. for m in re.finditer(r'(\W?)(^|\s)+\\' + keyword + r'\s*{',
  212. self.get_latex_text(),
  213. re.MULTILINE):
  214. if m.group(1) in [',', '&']:
  215. continue
  216. self.print_issue('use hello~\%s{ instead of hello \%s{' % (keyword,
  217. keyword),
  218. m, self.get_latex_text())
  219. self.errors += 1
  220. def test__check_citing_with_unbreakeable_spaces(self):
  221. self.get_latex_text = lambda: r'citing: wrong \cite{ciao} - right~\cite{ciao}'
  222. self.perform_test(self.check_citing_with_unbreakeable_spaces, 1)
  223. self.get_latex_text = lambda: r'refs done wrong \ref{ciao} - right~\ref{ciao}'
  224. self.perform_test(self.check_citing_with_unbreakeable_spaces, 1)
  225. self.get_latex_text = lambda: r'hello& \cite{ciao}'
  226. self.perform_test(self.check_citing_with_unbreakeable_spaces, 0)
  227. self.get_latex_text = lambda: r', \ref{ciao}'
  228. self.perform_test(self.check_citing_with_unbreakeable_spaces, 0)
  229. ##########################################################################
  230. def check_variations_of_word_spellings(self):
  231. words = defaultdict(Counter)
  232. for word in re.findall(r'\b\S+\b', self.get_text(), re.MULTILINE):
  233. word_alphanum = re.sub("[^a-zA-Z0-9_']+", '', word).lower()
  234. words[word_alphanum].update([word])
  235. for _, spellings_counter in words.iteritems():
  236. variations = len(spellings_counter.keys())
  237. total_appereances = sum(spellings_counter.values())
  238. if variations > 1:
  239. if len(set(w[1:] for w in spellings_counter.keys())) == 1:
  240. # FIXME: for now, if it's just a case mismatch on the first
  241. # letter, skip
  242. continue
  243. normalized_word = spellings_counter.keys()[0].lower()
  244. if normalized_word in STOPWORDS + PREPOSITIONS + CONJUNCTIONS:
  245. # Ignore common words
  246. continue
  247. # Ignore numbers
  248. try:
  249. float(normalized_word)
  250. continue
  251. except ValueError:
  252. pass
  253. self.print_issue('This word has multiple spellings: %s' % (
  254. dict(spellings_counter)), None)
  255. self.errors += 1
  256. def test__check_variations_of_word_spellings(self):
  257. self.get_text = lambda: (r'I has a cybercriminal. I had a cyber-criminal. '
  258. r'or was it a CyberCriminal?')
  259. self.perform_test(self.check_variations_of_word_spellings, 1)
  260. self.get_text = lambda: (r'no strange words here, however put. '
  261. r'However, is that true?')
  262. self.perform_test(self.check_variations_of_word_spellings, 0)
  263. self.get_text = lambda: (r'It matters the it factor ')
  264. self.perform_test(self.check_variations_of_word_spellings, 0)
  265. self.get_text = lambda: (r'1.6 16 and other fancy numbers.')
  266. self.perform_test(self.check_variations_of_word_spellings, 0)
  267. ##########################################################################
  268. def check_commas_in_numbers(self):
  269. # We also check in tables
  270. text = self.get_text(ignored_environments=tuple(
  271. set(Paper._IGNORED_ENVIRONMENTS) - set('table')))
  272. for m in re.finditer('(^|[^\w\-])\d{4}', text, re.MULTILINE):
  273. if text[m.start():m.start() + 1] in string.punctuation:
  274. continue
  275. try:
  276. number = int(text[m.start():m.end()])
  277. except:
  278. number = 0
  279. if number not in range(1990, datetime.date.today().year + 2):
  280. self.errors += 1
  281. self.print_issue('Put commas in numbers over 1,000', m)
  282. # This is the correct rule, but nobody follows it
  283. # for m in re.finditer('(^|[^\w\-])\d{5}', text, re.MULTILINE):
  284. # self.print_issue('Put commas in numbers over 10,000', m)
  285. # self.errors += 1
  286. # for m in re.finditer('[^\d]\d,\d{3}[^,]', text, re.MULTILINE):
  287. # self.print_issue("Don't put commas in numbers under 10,000", m)
  288. # self.errors += 1
  289. def test__check_commas_in_numbers(self):
  290. def get_text(*args, **kwargs): return text
  291. self.get_text = get_text
  292. text = r'10000 cats eat 10,000 mice'
  293. self.perform_test(self.check_commas_in_numbers, 1)
  294. text = r'9999 cats eat 9,999 mice'
  295. self.perform_test(self.check_commas_in_numbers, 1)
  296. text = r'1000 cats eat 999,999 mice'
  297. self.perform_test(self.check_commas_in_numbers, 1)
  298. text = r'project N10000, grant CNS-20000'
  299. self.perform_test(self.check_commas_in_numbers, 0)
  300. text = r'In 2001, we ate spaghetti'
  301. self.perform_test(self.check_commas_in_numbers, 0)
  302. ##########################################################################
  303. def check_commas_after_quotes(self):
  304. for m in re.finditer("''\s*,",
  305. self.get_text(),
  306. re.MULTILINE):
  307. self.print_issue("Convert ``hello'', => ``hello,''", m)
  308. self.errors += 1
  309. def test__check_commas_after_quotes(self):
  310. self.get_text = lambda: r"``flower'', should be ``flower,''"
  311. self.perform_test(self.check_commas_after_quotes, 1)
  312. ##########################################################################
  313. def check_always_capitalize(self):
  314. for reg in ["internet", "javascript"]:
  315. for m in re.finditer(r"\b{0}".format(reg),
  316. self.get_text(),
  317. re.MULTILINE):
  318. self.print_issue("Always capitalize", m)
  319. self.errors += 1
  320. def test__check_always_capitalize(self):
  321. self.get_text = lambda: r"internet"
  322. self.perform_test(self.check_always_capitalize, 1)
  323. self.get_text = lambda: r"testinternet"
  324. self.perform_test(self.check_always_capitalize, 0)
  325. ##########################################################################
  326. #
  327. # def check_comma_before_that(self):
  328. # for m in re.finditer(",\s+that",
  329. # self.get_text(),
  330. # re.MULTILINE):
  331. # phrase_start = max([
  332. # self.get_text().rfind(c, 0, m.start())
  333. # for c in ['\n', '.', ':', ';']
  334. # ] + [0])
  335. # phrase = self.get_text()[phrase_start:m.start() + 1]
  336. # if len([c for c in phrase if c == ',']) % 2 == 0:
  337. # # An even number of commas found, skipping
  338. # continue
  339. # self.print_issue("Do not put a comma before 'that'", m)
  340. # self.errors += 1
  341. #
  342. # def test__check_comma_before_that(self):
  343. # self.get_text = lambda: r"I like cats, that eat mice"
  344. # self.perform_test(self.check_comma_before_that, 1)
  345. # self.get_text = lambda: r"I like cats that eat mice"
  346. # self.perform_test(self.check_comma_before_that, 0)
  347. #
  348. # ##########################################################################
  349. #
  350. # def check_comma_before_which(self):
  351. # for m in re.finditer("[^,'*\s]\s+which",
  352. # self.get_text(),
  353. # re.MULTILINE):
  354. # word_before_start = self.get_text().rfind(' ', 0, m.start())
  355. # word_before = re.search("\w+", self.get_text()[
  356. # word_before_start + 1:
  357. # m.start() + 1]).group()
  358. # if word_before in PREPOSITIONS + CONJUNCTIONS:
  359. # continue
  360. # if word_before.endswith('ing') or word_before.endswith('ly'):
  361. # continue
  362. # # More expensive analysis: is the word before a verb?
  363. # phrase_start = max([
  364. # self.get_text().rfind(c, 0, m.start())
  365. # for c in ['\n', '.', ':', ';']
  366. # ] + [0])
  367. # phrase = self.get_text()[phrase_start:m.start() + 1]
  368. # word_before_kind = filter(lambda x: x[0] == word_before, nltk.pos_tag(
  369. # nltk.word_tokenize(phrase)))[0][1]
  370. # if word_before_kind.startswith('VB'):
  371. # continue
  372. # self.print_issue("Put a comma before 'which'", m)
  373. # self.errors += 1
  374. #
  375. # def test__check_comma_before_which(self):
  376. # self.get_text = lambda: r"I like that cat, which eat mice"
  377. # self.perform_test(self.check_comma_before_which, 0)
  378. # self.get_text = lambda: r"I like that cat which eat mice"
  379. # self.perform_test(self.check_comma_before_which, 1)
  380. # self.get_text = lambda: r"I know which cat eat mice"
  381. # self.perform_test(self.check_comma_before_which, 0)
  382. ##########################################################################
  383. def check_word_before_ref_is_capitalized(self):
  384. text = self.get_latex_text()
  385. for m in re.finditer('\\\\ref', text, re.MULTILINE):
  386. word_before_start = max(text.rfind(' ', 0, m.start() - 2),
  387. text.rfind('~', 0, m.start() - 2))
  388. word_before = re.findall("\w+", text[word_before_start + 1:
  389. m.start() + 1])[-1]
  390. if not word_before in ["and"] and not word_before[0].isupper():
  391. self.print_issue(r'Capitalize the word before \ref', m, text)
  392. self.errors += 1
  393. def test__check_word_before_ref_is_capitalized(self):
  394. self.get_latex_text = lambda: "in Section \\ref{sec}, see Figure \\ref{fig}"
  395. self.perform_test(self.check_word_before_ref_is_capitalized, 0)
  396. self.get_latex_text = lambda: "in section \\ref{sec}, see figure \\ref{fig}"
  397. self.perform_test(self.check_word_before_ref_is_capitalized, 2)
  398. self.get_latex_text = lambda: "section \\ref{sec}"
  399. self.perform_test(self.check_word_before_ref_is_capitalized, 1)
  400. ##########################################################################
  401. def check_british_spelling(self):
  402. british_spellings = {"acknowledgement": "acknowledgment", "afterwards": "afterward", "arse": "ass", "backwards": "backward",
  403. "cancelling": "canceling", "catalogue": "catalog", "centre": "center", "cheque": "check", "colour": "color", "dialogue": "dialog",
  404. "favour": "favor", "flavour": "flavor", "forwards": "forward", "grey": "gray", "judgement": "judgment", "labour": "labor",
  405. "lustre": "luster", "modelled": "modeled", "revelled": "raveled", "shovelled": "shoveled", "snivelled": "sniveled",
  406. "theatre": "theater", "towards": "toward", "travelling": "traveling", "yodelling": "yodeling"}
  407. for british, american in british_spellings.iteritems():
  408. for m in re.finditer("[^\w]+" + british + "[^\w]+", self.get_text()):
  409. self.print_issue("Don't spell like a bugger (that's british english) -" \
  410. " it's ' "+ american + " ' , not", m)
  411. self.errors += 1
  412. def test__check_british_spelling(self):
  413. self.get_text = lambda: r"Go to the (centre) of town to pick up the best flavour colour."
  414. self.perform_test(self.check_british_spelling, 3)
  415. self.get_text = lambda: r"I am an American, therefore I am"
  416. self.perform_test(self.check_british_spelling, 0)
  417. self.get_text = lambda: r"This cheque, right here, is unacceptable. I'll have to cheque with my manager."
  418. self.perform_test(self.check_british_spelling, 2)
  419. self.get_text = lambda: r"It is hard to parse this sentence."
  420. self.perform_test(self.check_british_spelling, 0)
  421. ##########################################################################
  422. def check_slang_and_gergal_terms(self):
  423. gergal = ['basically']
  424. for w in gergal:
  425. for m in re.finditer(w, self.get_text(),
  426. re.IGNORECASE):
  427. self.print_issue(
  428. "This word doesn't sound like it should be in a paper: "
  429. + w, m)
  430. self.errors += 1
  431. def test__check_slang_and_gergal_terms(self):
  432. self.get_text = lambda: r"Basically, this is wat we do"
  433. self.perform_test(self.check_slang_and_gergal_terms, 1)
  434. ##########################################################################
  435. def check_misplelled_standard_phrases(self):
  436. mispellings = {"in more details": "in more detail"}
  437. for wrong, right in mispellings.iteritems():
  438. for m in re.finditer("[^\w]+" + wrong + "[^\w]+", self.get_text()):
  439. self.print_issue("Mispelled standard phrase - ' %s ' "
  440. "should be ' %s' in" % (wrong, right), m)
  441. self.errors += 1
  442. def test__check_misplelled_standard_phrases(self):
  443. self.get_text = lambda: r"I'll discuss this in more details in section"
  444. self.perform_test(self.check_misplelled_standard_phrases, 1)
  445. ##########################################################################
  446. def check_banned_words(self):
  447. banned_words = ["is[\s]+comprised[\s]+of",
  448. "doesn't",
  449. "beside",
  450. "won't",
  451. "can't"]
  452. for banned_word in banned_words:
  453. for m in re.finditer("([^\w]+|^)" + banned_word + "[^\w]+",
  454. self.get_text(), flags=re.IGNORECASE):
  455. self.print_issue("Don't use %s" % banned_word, m)
  456. self.errors += 1
  457. def test__check_banned_words(self):
  458. self.get_text = lambda: r"Adam is comprised of a brain and a stomach."
  459. self.perform_test(self.check_banned_words, 1)
  460. self.get_text = lambda: r"Adam comprises a brain and a stomach."
  461. self.perform_test(self.check_banned_words, 0)
  462. self.get_text = lambda: r"You don't know what that is. Comprised. Of."
  463. self.perform_test(self.check_banned_words, 0)
  464. self.get_text = lambda: r"Is comprised of blah and bloop."
  465. self.perform_test(self.check_banned_words, 1)
  466. self.get_text = lambda: r"Adam is awesome. Is comprised of blah and bloop."
  467. self.perform_test(self.check_banned_words, 1)
  468. self.get_text = lambda: r"Don't do this. I won't, tell anybody."
  469. self.perform_test(self.check_banned_words, 1)
  470. ##########################################################################
  471. def check_repeated_words(self):
  472. for m in re.finditer(r"\b(\w+)\W+\1\b",
  473. self.get_text(), flags=re.IGNORECASE):
  474. if m.group(1).isdigit():
  475. continue
  476. self.print_issue("Repeated word '%s'" % m.group(1), m)
  477. self.errors += 1
  478. def test__check_repeated_words(self):
  479. self.get_text = lambda: r"We use this this and that."
  480. self.perform_test(self.check_repeated_words, 1)
  481. self.get_text = lambda: r"We use this and that, and this and that too."
  482. self.perform_test(self.check_repeated_words, 0)
  483. self.get_text = lambda: r"This. This is a sentence Sentence."
  484. self.perform_test(self.check_repeated_words, 2)
  485. self.get_text = lambda: r"Version 4.4."
  486. self.perform_test(self.check_repeated_words, 0)
  487. if __name__ == '__main__':
  488. if len(sys.argv) < 2:
  489. print """Usage:
  490. - chrisper *.tex
  491. - chrisper test
  492. Runs the test suite.
  493. """
  494. sys.exit(0)
  495. if sys.argv[1] == "test":
  496. Paper(sys.argv[2:])._run_all_tests()
  497. print colored.green("\n=== ALL TESTS PASSED ===")
  498. else:
  499. paper = Paper(sys.argv[1:])
  500. paper._run_all_checks()
  501. if paper.errors == 0:
  502. print colored.green('=== IT LOOKS GOOD TO ME. CONGRATS! ===')
  503. else:
  504. print colored.yellow("\n=== I'VE FOUND %d ISSUES ===" %
  505. paper.errors)
  506. sys.exit(1)