# Natural Language Toolkit: Regexp Chunk Parser Application # # Copyright (C) 2001-2019 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring the regular expression based chunk parser ``nltk.chunk.RegexpChunkParser``. """ # Todo: Add a way to select the development set from the menubar. This # might just need to be a selection box (conll vs treebank etc) plus # configuration parameters to select what's being chunked (eg VP vs NP) # and what part of the data is being used as the development set. from __future__ import division import time import textwrap import re import random from six.moves.tkinter import ( Button, Canvas, Checkbutton, Frame, IntVar, Label, Menu, Scrollbar, Text, Tk, ) from six.moves.tkinter_tkfiledialog import askopenfilename, asksaveasfilename from six.moves.tkinter_font import Font from nltk.tree import Tree from nltk.util import in_idle from nltk.draw.util import ShowText from nltk.corpus import conll2000, treebank_chunk from nltk.chunk import ChunkScore, RegexpChunkParser from nltk.chunk.regexp import RegexpChunkRule class RegexpChunkApp(object): """ A graphical tool for exploring the regular expression based chunk parser ``nltk.chunk.RegexpChunkParser``. See ``HELP`` for instructional text. """ ##///////////////////////////////////////////////////////////////// ## Help Text ##///////////////////////////////////////////////////////////////// #: A dictionary mapping from part of speech tags to descriptions, #: which is used in the help text. (This should probably live with #: the conll and/or treebank corpus instead.) TAGSET = { 'CC': 'Coordinating conjunction', 'PRP$': 'Possessive pronoun', 'CD': 'Cardinal number', 'RB': 'Adverb', 'DT': 'Determiner', 'RBR': 'Adverb, comparative', 'EX': 'Existential there', 'RBS': 'Adverb, superlative', 'FW': 'Foreign word', 'RP': 'Particle', 'JJ': 'Adjective', 'TO': 'to', 'JJR': 'Adjective, comparative', 'UH': 'Interjection', 'JJS': 'Adjective, superlative', 'VB': 'Verb, base form', 'LS': 'List item marker', 'VBD': 'Verb, past tense', 'MD': 'Modal', 'NNS': 'Noun, plural', 'NN': 'Noun, singular or masps', 'VBN': 'Verb, past participle', 'VBZ': 'Verb,3rd ps. sing. present', 'NNP': 'Proper noun, singular', 'NNPS': 'Proper noun plural', 'WDT': 'wh-determiner', 'PDT': 'Predeterminer', 'WP': 'wh-pronoun', 'POS': 'Possessive ending', 'WP$': 'Possessive wh-pronoun', 'PRP': 'Personal pronoun', 'WRB': 'wh-adverb', '(': 'open parenthesis', ')': 'close parenthesis', '``': 'open quote', ',': 'comma', "''": 'close quote', '.': 'period', '#': 'pound sign (currency marker)', '$': 'dollar sign (currency marker)', 'IN': 'Preposition/subord. conjunction', 'SYM': 'Symbol (mathematical or scientific)', 'VBG': 'Verb, gerund/present participle', 'VBP': 'Verb, non-3rd ps. sing. present', ':': 'colon', } #: Contents for the help box. This is a list of tuples, one for #: each help page, where each tuple has four elements: #: - A title (displayed as a tab) #: - A string description of tabstops (see Tkinter.Text for details) #: - The text contents for the help page. You can use expressions #: like ... to colorize the text; see ``HELP_AUTOTAG`` #: for a list of tags you can use for colorizing. HELP = [ ( 'Help', '20', "Welcome to the regular expression chunk-parser grammar editor. " "You can use this editor to develop and test chunk parser grammars " "based on NLTK's RegexpChunkParser class.\n\n" # Help box. "Use this box ('Help') to learn more about the editor; click on the " "tabs for help on specific topics:" "\n" "Rules: grammar rule types\n" "Regexps: regular expression syntax\n" "Tags: part of speech tags\n\n" # Grammar. "Use the upper-left box ('Grammar') to edit your grammar. " "Each line of your grammar specifies a single 'rule', " "which performs an action such as creating a chunk or merging " "two chunks.\n\n" # Dev set. "The lower-left box ('Development Set') runs your grammar on the " "development set, and displays the results. " "Your grammar's chunks are highlighted, and " "the correct (gold standard) chunks are " "underlined. If they " "match, they are displayed in green; otherwise, " "they are displayed in red. The box displays a single " "sentence from the development set at a time; use the scrollbar or " "the next/previous buttons view additional sentences.\n\n" # Performance "The lower-right box ('Evaluation') tracks the performance of " "your grammar on the development set. The 'precision' axis " "indicates how many of your grammar's chunks are correct; and " "the 'recall' axis indicates how many of the gold standard " "chunks your system generated. Typically, you should try to " "design a grammar that scores high on both metrics. The " "exact precision and recall of the current grammar, as well " "as their harmonic mean (the 'f-score'), are displayed in " "the status bar at the bottom of the window.", ), ( 'Rules', '10', "

{...regexp...}

" "\nChunk rule: creates new chunks from words matching " "regexp.\n\n" "

}...regexp...{

" "\nChink rule: removes words matching regexp from existing " "chunks.\n\n" "

...regexp1...}{...regexp2...

" "\nSplit rule: splits chunks that match regexp1 followed by " "regexp2 in two.\n\n" "

...regexp...{}...regexp...

" "\nMerge rule: joins consecutive chunks that match regexp1 " "and regexp2\n", ), ( 'Regexps', '10 60', # "Regular Expression Syntax Summary:\n\n" "

Pattern\t\tMatches...

\n" "" "\t<T>\ta word with tag T " "(where T may be a regexp).\n" "\tx?\tan optional x\n" "\tx+\ta sequence of 1 or more x's\n" "\tx*\ta sequence of 0 or more x's\n" "\tx|y\tx or y\n" "\t.\tmatches any character\n" "\t(x)\tTreats x as a group\n" "\t# x...\tTreats x... " "(to the end of the line) as a comment\n" "\t\\C\tmatches character C " "(useful when C is a special character " "like + or #)\n" "" "\n

Examples:

\n" "" '\t\n' '\t\tMatches "cow/NN"\n' '\t\tMatches "green/NN"\n' '\t\n' '\t\tMatches "eating/VBG"\n' '\t\tMatches "ate/VBD"\n' '\t

\n' '\t\tMatches "on/IN the/DT car/NN"\n' '\t?\n' '\t\tMatches "ran/VBD"\n' '\t\tMatches "slowly/RB ate/VBD"\n' '\t<\#> # This is a comment...\n' '\t\tMatches "#/# 100/CD"\n' "", ), ( 'Tags', '10 60', "

Part of Speech Tags:

\n" + '' + '<>' + '\n', # this gets auto-substituted w/ self.TAGSET ), ] HELP_AUTOTAG = [ ('red', dict(foreground='#a00')), ('green', dict(foreground='#080')), ('highlight', dict(background='#ddd')), ('underline', dict(underline=True)), ('h1', dict(underline=True)), ('indent', dict(lmargin1=20, lmargin2=20)), ('hangindent', dict(lmargin1=0, lmargin2=60)), ('var', dict(foreground='#88f')), ('regexp', dict(foreground='#ba7')), ('match', dict(foreground='#6a6')), ] ##///////////////////////////////////////////////////////////////// ## Config Parmeters ##///////////////////////////////////////////////////////////////// _EVAL_DELAY = 1 """If the user has not pressed any key for this amount of time (in seconds), and the current grammar has not been evaluated, then the eval demon will evaluate it.""" _EVAL_CHUNK = 15 """The number of sentences that should be evaluated by the eval demon each time it runs.""" _EVAL_FREQ = 0.2 """The frequency (in seconds) at which the eval demon is run""" _EVAL_DEMON_MIN = 0.02 """The minimum amount of time that the eval demon should take each time it runs -- if it takes less than this time, _EVAL_CHUNK will be modified upwards.""" _EVAL_DEMON_MAX = 0.04 """The maximum amount of time that the eval demon should take each time it runs -- if it takes more than this time, _EVAL_CHUNK will be modified downwards.""" _GRAMMARBOX_PARAMS = dict( width=40, height=12, background='#efe', highlightbackground='#efe', highlightthickness=1, relief='groove', border=2, wrap='word', ) _HELPBOX_PARAMS = dict( width=15, height=15, background='#efe', highlightbackground='#efe', foreground='#555', highlightthickness=1, relief='groove', border=2, wrap='word', ) _DEVSETBOX_PARAMS = dict( width=70, height=10, background='#eef', highlightbackground='#eef', highlightthickness=1, relief='groove', border=2, wrap='word', tabs=(30,), ) _STATUS_PARAMS = dict(background='#9bb', relief='groove', border=2) _FONT_PARAMS = dict(family='helvetica', size=-20) _FRAME_PARAMS = dict(background='#777', padx=2, pady=2, border=3) _EVALBOX_PARAMS = dict( background='#eef', highlightbackground='#eef', highlightthickness=1, relief='groove', border=2, width=300, height=280, ) _BUTTON_PARAMS = dict( background='#777', activebackground='#777', highlightbackground='#777' ) _HELPTAB_BG_COLOR = '#aba' _HELPTAB_FG_COLOR = '#efe' _HELPTAB_FG_PARAMS = dict(background='#efe') _HELPTAB_BG_PARAMS = dict(background='#aba') _HELPTAB_SPACER = 6 def normalize_grammar(self, grammar): # Strip comments grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar) # Normalize whitespace grammar = re.sub(' +', ' ', grammar) grammar = re.sub('\n\s+', '\n', grammar) grammar = grammar.strip() # [xx] Hack: automatically backslash $! grammar = re.sub(r'([^\\])\$', r'\1\\$', grammar) return grammar def __init__( self, devset_name='conll2000', devset=None, grammar='', chunk_label='NP', tagset=None, ): """ :param devset_name: The name of the development set; used for display & for save files. If either the name 'treebank' or the name 'conll2000' is used, and devset is None, then devset will be set automatically. :param devset: A list of chunked sentences :param grammar: The initial grammar to display. :param tagset: Dictionary from tags to string descriptions, used for the help page. Defaults to ``self.TAGSET``. """ self._chunk_label = chunk_label if tagset is None: tagset = self.TAGSET self.tagset = tagset # Named development sets: if devset is None: if devset_name == 'conll2000': devset = conll2000.chunked_sents('train.txt') # [:100] elif devset == 'treebank': devset = treebank_chunk.chunked_sents() # [:100] else: raise ValueError('Unknown development set %s' % devset_name) self.chunker = None """The chunker built from the grammar string""" self.grammar = grammar """The unparsed grammar string""" self.normalized_grammar = None """A normalized version of ``self.grammar``.""" self.grammar_changed = 0 """The last time() that the grammar was changed.""" self.devset = devset """The development set -- a list of chunked sentences.""" self.devset_name = devset_name """The name of the development set (for save files).""" self.devset_index = -1 """The index into the development set of the first instance that's currently being viewed.""" self._last_keypress = 0 """The time() when a key was most recently pressed""" self._history = [] """A list of (grammar, precision, recall, fscore) tuples for grammars that the user has already tried.""" self._history_index = 0 """When the user is scrolling through previous grammars, this is used to keep track of which grammar they're looking at.""" self._eval_grammar = None """The grammar that is being currently evaluated by the eval demon.""" self._eval_normalized_grammar = None """A normalized copy of ``_eval_grammar``.""" self._eval_index = 0 """The index of the next sentence in the development set that should be looked at by the eval demon.""" self._eval_score = ChunkScore(chunk_label=chunk_label) """The ``ChunkScore`` object that's used to keep track of the score of the current grammar on the development set.""" # Set up the main window. top = self.top = Tk() top.geometry('+50+50') top.title('Regexp Chunk Parser App') top.bind('', self.destroy) # Varaible that restricts how much of the devset we look at. self._devset_size = IntVar(top) self._devset_size.set(100) # Set up all the tkinter widgets self._init_fonts(top) self._init_widgets(top) self._init_bindings(top) self._init_menubar(top) self.grammarbox.focus() # If a grammar was given, then display it. if grammar: self.grammarbox.insert('end', grammar + '\n') self.grammarbox.mark_set('insert', '1.0') # Display the first item in the development set self.show_devset(0) self.update() def _init_bindings(self, top): top.bind('', self._devset_next) top.bind('', self._devset_prev) top.bind('', self.toggle_show_trace) top.bind('', self.update) top.bind('', lambda e: self.save_grammar()) top.bind('', lambda e: self.load_grammar()) self.grammarbox.bind('', self.toggle_show_trace) self.grammarbox.bind('', self._devset_next) self.grammarbox.bind('', self._devset_prev) # Redraw the eval graph when the window size changes self.evalbox.bind('', self._eval_plot) def _init_fonts(self, top): # TWhat's our font size (default=same as sysfont) self._size = IntVar(top) self._size.set(20) self._font = Font(family='helvetica', size=-self._size.get()) self._smallfont = Font( family='helvetica', size=-(int(self._size.get() * 14 // 20)) ) def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label='Reset Application', underline=0, command=self.reset) filemenu.add_command( label='Save Current Grammar', underline=0, accelerator='Ctrl-s', command=self.save_grammar, ) filemenu.add_command( label='Load Grammar', underline=0, accelerator='Ctrl-o', command=self.load_grammar, ) filemenu.add_command( label='Save Grammar History', underline=13, command=self.save_history ) filemenu.add_command( label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q' ) menubar.add_cascade(label='File', underline=0, menu=filemenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_radiobutton( label='Tiny', variable=self._size, underline=0, value=10, command=self.resize, ) viewmenu.add_radiobutton( label='Small', variable=self._size, underline=0, value=16, command=self.resize, ) viewmenu.add_radiobutton( label='Medium', variable=self._size, underline=0, value=20, command=self.resize, ) viewmenu.add_radiobutton( label='Large', variable=self._size, underline=0, value=24, command=self.resize, ) viewmenu.add_radiobutton( label='Huge', variable=self._size, underline=0, value=34, command=self.resize, ) menubar.add_cascade(label='View', underline=0, menu=viewmenu) devsetmenu = Menu(menubar, tearoff=0) devsetmenu.add_radiobutton( label='50 sentences', variable=self._devset_size, value=50, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label='100 sentences', variable=self._devset_size, value=100, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label='200 sentences', variable=self._devset_size, value=200, command=self.set_devset_size, ) devsetmenu.add_radiobutton( label='500 sentences', variable=self._devset_size, value=500, command=self.set_devset_size, ) menubar.add_cascade(label='Development-Set', underline=0, menu=devsetmenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label='About', underline=0, command=self.about) menubar.add_cascade(label='Help', underline=0, menu=helpmenu) parent.config(menu=menubar) def toggle_show_trace(self, *e): if self._showing_trace: self.show_devset() else: self.show_trace() return 'break' _SCALE_N = 5 # center on the last 5 examples. _DRAW_LINES = False def _eval_plot(self, *e, **config): width = config.get('width', self.evalbox.winfo_width()) height = config.get('height', self.evalbox.winfo_height()) # Clear the canvas self.evalbox.delete('all') # Draw the precision & recall labels. tag = self.evalbox.create_text( 10, height // 2 - 10, justify='left', anchor='w', text='Precision' ) left, right = self.evalbox.bbox(tag)[2] + 5, width - 10 tag = self.evalbox.create_text( left + (width - left) // 2, height - 10, anchor='s', text='Recall', justify='center', ) top, bot = 10, self.evalbox.bbox(tag)[1] - 10 # Draw masks for clipping the plot. bg = self._EVALBOX_PARAMS['background'] self.evalbox.lower( self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg) ) self.evalbox.lower( self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg) ) # Calculate the plot's scale. if self._autoscale.get() and len(self._history) > 1: max_precision = max_recall = 0 min_precision = min_recall = 1 for i in range(1, min(len(self._history), self._SCALE_N + 1)): grammar, precision, recall, fmeasure = self._history[-i] min_precision = min(precision, min_precision) min_recall = min(recall, min_recall) max_precision = max(precision, max_precision) max_recall = max(recall, max_recall) # if max_precision-min_precision > max_recall-min_recall: # min_recall -= (max_precision-min_precision)/2 # max_recall += (max_precision-min_precision)/2 # else: # min_precision -= (max_recall-min_recall)/2 # max_precision += (max_recall-min_recall)/2 # if min_recall < 0: # max_recall -= min_recall # min_recall = 0 # if min_precision < 0: # max_precision -= min_precision # min_precision = 0 min_precision = max(min_precision - 0.01, 0) min_recall = max(min_recall - 0.01, 0) max_precision = min(max_precision + 0.01, 1) max_recall = min(max_recall + 0.01, 1) else: min_precision = min_recall = 0 max_precision = max_recall = 1 # Draw the axis lines & grid lines for i in range(11): x = left + (right - left) * ( (i / 10.0 - min_recall) / (max_recall - min_recall) ) y = bot - (bot - top) * ( (i / 10.0 - min_precision) / (max_precision - min_precision) ) if left < x < right: self.evalbox.create_line(x, top, x, bot, fill='#888') if top < y < bot: self.evalbox.create_line(left, y, right, y, fill='#888') self.evalbox.create_line(left, top, left, bot) self.evalbox.create_line(left, bot, right, bot) # Display the plot's scale self.evalbox.create_text( left - 3, bot, justify='right', anchor='se', text='%d%%' % (100 * min_precision), ) self.evalbox.create_text( left - 3, top, justify='right', anchor='ne', text='%d%%' % (100 * max_precision), ) self.evalbox.create_text( left, bot + 3, justify='center', anchor='nw', text='%d%%' % (100 * min_recall), ) self.evalbox.create_text( right, bot + 3, justify='center', anchor='ne', text='%d%%' % (100 * max_recall), ) # Display the scores. prev_x = prev_y = None for i, (_, precision, recall, fscore) in enumerate(self._history): x = left + (right - left) * ( (recall - min_recall) / (max_recall - min_recall) ) y = bot - (bot - top) * ( (precision - min_precision) / (max_precision - min_precision) ) if i == self._history_index: self.evalbox.create_oval( x - 2, y - 2, x + 2, y + 2, fill='#0f0', outline='#000' ) self.status['text'] = ( 'Precision: %.2f%%\t' % (precision * 100) + 'Recall: %.2f%%\t' % (recall * 100) + 'F-score: %.2f%%' % (fscore * 100) ) else: self.evalbox.lower( self.evalbox.create_oval( x - 2, y - 2, x + 2, y + 2, fill='#afa', outline='#8c8' ) ) if prev_x is not None and self._eval_lines.get(): self.evalbox.lower( self.evalbox.create_line(prev_x, prev_y, x, y, fill='#8c8') ) prev_x, prev_y = x, y _eval_demon_running = False def _eval_demon(self): if self.top is None: return if self.chunker is None: self._eval_demon_running = False return # Note our starting time. t0 = time.time() # If are still typing, then wait for them to finish. if ( time.time() - self._last_keypress < self._EVAL_DELAY and self.normalized_grammar != self._eval_normalized_grammar ): self._eval_demon_running = True return self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) # If the grammar changed, restart the evaluation. if self.normalized_grammar != self._eval_normalized_grammar: # Check if we've seen this grammar already. If so, then # just use the old evaluation values. for (g, p, r, f) in self._history: if self.normalized_grammar == self.normalize_grammar(g): self._history.append((g, p, r, f)) self._history_index = len(self._history) - 1 self._eval_plot() self._eval_demon_running = False self._eval_normalized_grammar = None return self._eval_index = 0 self._eval_score = ChunkScore(chunk_label=self._chunk_label) self._eval_grammar = self.grammar self._eval_normalized_grammar = self.normalized_grammar # If the grammar is empty, the don't bother evaluating it, or # recording it in history -- the score will just be 0. if self.normalized_grammar.strip() == '': # self._eval_index = self._devset_size.get() self._eval_demon_running = False return # Score the next set of examples for gold in self.devset[ self._eval_index : min( self._eval_index + self._EVAL_CHUNK, self._devset_size.get() ) ]: guess = self._chunkparse(gold.leaves()) self._eval_score.score(gold, guess) # update our index in the devset. self._eval_index += self._EVAL_CHUNK # Check if we're done if self._eval_index >= self._devset_size.get(): self._history.append( ( self._eval_grammar, self._eval_score.precision(), self._eval_score.recall(), self._eval_score.f_measure(), ) ) self._history_index = len(self._history) - 1 self._eval_plot() self._eval_demon_running = False self._eval_normalized_grammar = None else: progress = 100 * self._eval_index / self._devset_size.get() self.status['text'] = 'Evaluating on Development Set (%d%%)' % progress self._eval_demon_running = True self._adaptively_modify_eval_chunk(time.time() - t0) self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) def _adaptively_modify_eval_chunk(self, t): """ Modify _EVAL_CHUNK to try to keep the amount of time that the eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX. :param t: The amount of time that the eval demon took. """ if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5: self._EVAL_CHUNK = min( self._EVAL_CHUNK - 1, max( int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)), self._EVAL_CHUNK - 10, ), ) elif t < self._EVAL_DEMON_MIN: self._EVAL_CHUNK = max( self._EVAL_CHUNK + 1, min( int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)), self._EVAL_CHUNK + 10, ), ) def _init_widgets(self, top): frame0 = Frame(top, **self._FRAME_PARAMS) frame0.grid_columnconfigure(0, weight=4) frame0.grid_columnconfigure(3, weight=2) frame0.grid_rowconfigure(1, weight=1) frame0.grid_rowconfigure(5, weight=1) # The grammar self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS) self.grammarlabel = Label( frame0, font=self._font, text='Grammar:', highlightcolor='black', background=self._GRAMMARBOX_PARAMS['background'], ) self.grammarlabel.grid(column=0, row=0, sticky='SW') self.grammarbox.grid(column=0, row=1, sticky='NEWS') # Scroll bar for grammar grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview) grammar_scrollbar.grid(column=1, row=1, sticky='NWS') self.grammarbox.config(yscrollcommand=grammar_scrollbar.set) # grammar buttons bg = self._FRAME_PARAMS['background'] frame3 = Frame(frame0, background=bg) frame3.grid(column=0, row=2, sticky='EW') Button( frame3, text='Prev Grammar', command=self._history_prev, **self._BUTTON_PARAMS ).pack(side='left') Button( frame3, text='Next Grammar', command=self._history_next, **self._BUTTON_PARAMS ).pack(side='left') # Help box self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS) self.helpbox.grid(column=3, row=1, sticky='NEWS') self.helptabs = {} bg = self._FRAME_PARAMS['background'] helptab_frame = Frame(frame0, background=bg) helptab_frame.grid(column=3, row=0, sticky='SW') for i, (tab, tabstops, text) in enumerate(self.HELP): label = Label(helptab_frame, text=tab, font=self._smallfont) label.grid(column=i * 2, row=0, sticky='S') # help_frame.grid_columnconfigure(i, weight=1) # label.pack(side='left') label.bind('', lambda e, tab=tab: self.show_help(tab)) self.helptabs[tab] = label Frame( helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg ).grid(column=i * 2 + 1, row=0) self.helptabs[self.HELP[0][0]].configure(font=self._font) self.helpbox.tag_config('elide', elide=True) for (tag, params) in self.HELP_AUTOTAG: self.helpbox.tag_config('tag-%s' % tag, **params) self.show_help(self.HELP[0][0]) # Scroll bar for helpbox help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview) self.helpbox.config(yscrollcommand=help_scrollbar.set) help_scrollbar.grid(column=4, row=1, sticky='NWS') # The dev set frame4 = Frame(frame0, background=self._FRAME_PARAMS['background']) self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS) self.devsetbox.pack(expand=True, fill='both') self.devsetlabel = Label( frame0, font=self._font, text='Development Set:', justify='right', background=self._DEVSETBOX_PARAMS['background'], ) self.devsetlabel.grid(column=0, row=4, sticky='SW') frame4.grid(column=0, row=5, sticky='NEWS') # dev set scrollbars self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll) self.devset_scroll.grid(column=1, row=5, sticky='NWS') self.devset_xscroll = Scrollbar( frame4, command=self.devsetbox.xview, orient='horiz' ) self.devsetbox['xscrollcommand'] = self.devset_xscroll.set self.devset_xscroll.pack(side='bottom', fill='x') # dev set buttons bg = self._FRAME_PARAMS['background'] frame1 = Frame(frame0, background=bg) frame1.grid(column=0, row=7, sticky='EW') Button( frame1, text='Prev Example (Ctrl-p)', command=self._devset_prev, **self._BUTTON_PARAMS ).pack(side='left') Button( frame1, text='Next Example (Ctrl-n)', command=self._devset_next, **self._BUTTON_PARAMS ).pack(side='left') self.devset_button = Button( frame1, text='Show example', command=self.show_devset, state='disabled', **self._BUTTON_PARAMS ) self.devset_button.pack(side='right') self.trace_button = Button( frame1, text='Show trace', command=self.show_trace, **self._BUTTON_PARAMS ) self.trace_button.pack(side='right') # evaluation box self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS) label = Label( frame0, font=self._font, text='Evaluation:', justify='right', background=self._EVALBOX_PARAMS['background'], ) label.grid(column=3, row=4, sticky='SW') self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2) # evaluation box buttons bg = self._FRAME_PARAMS['background'] frame2 = Frame(frame0, background=bg) frame2.grid(column=3, row=7, sticky='EW') self._autoscale = IntVar(self.top) self._autoscale.set(False) Checkbutton( frame2, variable=self._autoscale, command=self._eval_plot, text='Zoom', **self._BUTTON_PARAMS ).pack(side='left') self._eval_lines = IntVar(self.top) self._eval_lines.set(False) Checkbutton( frame2, variable=self._eval_lines, command=self._eval_plot, text='Lines', **self._BUTTON_PARAMS ).pack(side='left') Button(frame2, text='History', **self._BUTTON_PARAMS).pack(side='right') # The status label self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS) self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2, columnspan=5) # Help box & devset box can't be edited. self.helpbox['state'] = 'disabled' self.devsetbox['state'] = 'disabled' # Spacers bg = self._FRAME_PARAMS['background'] Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3) Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0) Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8) # pack the frame. frame0.pack(fill='both', expand=True) # Set up colors for the devset box self.devsetbox.tag_config('true-pos', background='#afa', underline='True') self.devsetbox.tag_config('false-neg', underline='True', foreground='#800') self.devsetbox.tag_config('false-pos', background='#faa') self.devsetbox.tag_config('trace', foreground='#666', wrap='none') self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none') self.devsetbox.tag_config('error', foreground='#800') # And for the grammarbox self.grammarbox.tag_config('error', background='#fec') self.grammarbox.tag_config('comment', foreground='#840') self.grammarbox.tag_config('angle', foreground='#00f') self.grammarbox.tag_config('brace', foreground='#0a0') self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40) _showing_trace = False def show_trace(self, *e): self._showing_trace = True self.trace_button['state'] = 'disabled' self.devset_button['state'] = 'normal' self.devsetbox['state'] = 'normal' # self.devsetbox['wrap'] = 'none' self.devsetbox.delete('1.0', 'end') self.devsetlabel['text'] = 'Development Set (%d/%d)' % ( (self.devset_index + 1, self._devset_size.get()) ) if self.chunker is None: self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.') self.devsetbox.tag_add('error', '1.0', 'end') return # can't do anything more gold_tree = self.devset[self.devset_index] rules = self.chunker.rules() # Calculate the tag sequence tagseq = '\t' charnum = [1] for wordnum, (word, pos) in enumerate(gold_tree.leaves()): tagseq += '%s ' % pos charnum.append(len(tagseq)) self.charnum = dict( ((i, j), charnum[j]) for i in range(len(rules) + 1) for j in range(len(charnum)) ) self.linenum = dict((i, i * 2 + 2) for i in range(len(rules) + 1)) for i in range(len(rules) + 1): if i == 0: self.devsetbox.insert('end', 'Start:\n') self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c') else: self.devsetbox.insert('end', 'Apply %s:\n' % rules[i - 1]) self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c') # Display the tag sequence. self.devsetbox.insert('end', tagseq + '\n') self.devsetbox.tag_add('wrapindent', 'end -2c linestart', 'end -2c') # Run a partial parser, and extract gold & test chunks chunker = RegexpChunkParser(rules[:i]) test_tree = self._chunkparse(gold_tree.leaves()) gold_chunks = self._chunks(gold_tree) test_chunks = self._chunks(test_tree) # Compare them. for chunk in gold_chunks.intersection(test_chunks): self._color_chunk(i, chunk, 'true-pos') for chunk in gold_chunks - test_chunks: self._color_chunk(i, chunk, 'false-neg') for chunk in test_chunks - gold_chunks: self._color_chunk(i, chunk, 'false-pos') self.devsetbox.insert('end', 'Finished.\n') self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c') # This is a hack, because the x-scrollbar isn't updating its # position right -- I'm not sure what the underlying cause is # though. (This is on OS X w/ python 2.5) self.top.after(100, self.devset_xscroll.set, 0, 0.3) def show_help(self, tab): self.helpbox['state'] = 'normal' self.helpbox.delete('1.0', 'end') for (name, tabstops, text) in self.HELP: if name == tab: text = text.replace( '<>', '\n'.join( ( '\t%s\t%s' % item for item in sorted( list(self.tagset.items()), key=lambda t_w: re.match('\w+', t_w[0]) and (0, t_w[0]) or (1, t_w[0]), ) ) ), ) self.helptabs[name].config(**self._HELPTAB_FG_PARAMS) self.helpbox.config(tabs=tabstops) self.helpbox.insert('1.0', text + '\n' * 20) C = '1.0 + %d chars' for (tag, params) in self.HELP_AUTOTAG: pattern = '(?s)(<%s>)(.*?)()' % (tag, tag) for m in re.finditer(pattern, text): self.helpbox.tag_add('elide', C % m.start(1), C % m.end(1)) self.helpbox.tag_add( 'tag-%s' % tag, C % m.start(2), C % m.end(2) ) self.helpbox.tag_add('elide', C % m.start(3), C % m.end(3)) else: self.helptabs[name].config(**self._HELPTAB_BG_PARAMS) self.helpbox['state'] = 'disabled' def _history_prev(self, *e): self._view_history(self._history_index - 1) return 'break' def _history_next(self, *e): self._view_history(self._history_index + 1) return 'break' def _view_history(self, index): # Bounds & sanity checking: index = max(0, min(len(self._history) - 1, index)) if not self._history: return # Already viewing the requested history item? if index == self._history_index: return # Show the requested grammar. It will get added to _history # only if they edit it (causing self.update() to get run.) self.grammarbox['state'] = 'normal' self.grammarbox.delete('1.0', 'end') self.grammarbox.insert('end', self._history[index][0]) self.grammarbox.mark_set('insert', '1.0') self._history_index = index self._syntax_highlight_grammar(self._history[index][0]) # Record the normalized grammar & regenerate the chunker. self.normalized_grammar = self.normalize_grammar(self._history[index][0]) if self.normalized_grammar: rules = [ RegexpChunkRule.fromstring(line) for line in self.normalized_grammar.split('\n') ] else: rules = [] self.chunker = RegexpChunkParser(rules) # Show the score. self._eval_plot() # Update the devset box self._highlight_devset() if self._showing_trace: self.show_trace() # Update the grammar label if self._history_index < len(self._history) - 1: self.grammarlabel['text'] = 'Grammar %s/%s:' % ( self._history_index + 1, len(self._history), ) else: self.grammarlabel['text'] = 'Grammar:' def _devset_next(self, *e): self._devset_scroll('scroll', 1, 'page') return 'break' def _devset_prev(self, *e): self._devset_scroll('scroll', -1, 'page') return 'break' def destroy(self, *e): if self.top is None: return self.top.destroy() self.top = None def _devset_scroll(self, command, *args): N = 1 # size of a page -- one sentence. showing_trace = self._showing_trace if command == 'scroll' and args[1].startswith('unit'): self.show_devset(self.devset_index + int(args[0])) elif command == 'scroll' and args[1].startswith('page'): self.show_devset(self.devset_index + N * int(args[0])) elif command == 'moveto': self.show_devset(int(float(args[0]) * self._devset_size.get())) else: assert 0, 'bad scroll command %s %s' % (command, args) if showing_trace: self.show_trace() def show_devset(self, index=None): if index is None: index = self.devset_index # Bounds checking index = min(max(0, index), self._devset_size.get() - 1) if index == self.devset_index and not self._showing_trace: return self.devset_index = index self._showing_trace = False self.trace_button['state'] = 'normal' self.devset_button['state'] = 'disabled' # Clear the text box. self.devsetbox['state'] = 'normal' self.devsetbox['wrap'] = 'word' self.devsetbox.delete('1.0', 'end') self.devsetlabel['text'] = 'Development Set (%d/%d)' % ( (self.devset_index + 1, self._devset_size.get()) ) # Add the sentences sample = self.devset[self.devset_index : self.devset_index + 1] self.charnum = {} self.linenum = {0: 1} for sentnum, sent in enumerate(sample): linestr = '' for wordnum, (word, pos) in enumerate(sent.leaves()): self.charnum[sentnum, wordnum] = len(linestr) linestr += '%s/%s ' % (word, pos) self.charnum[sentnum, wordnum + 1] = len(linestr) self.devsetbox.insert('end', linestr[:-1] + '\n\n') # Highlight chunks in the dev set if self.chunker is not None: self._highlight_devset() self.devsetbox['state'] = 'disabled' # Update the scrollbar first = self.devset_index / self._devset_size.get() last = (self.devset_index + 2) / self._devset_size.get() self.devset_scroll.set(first, last) def _chunks(self, tree): chunks = set() wordnum = 0 for child in tree: if isinstance(child, Tree): if child.label() == self._chunk_label: chunks.add((wordnum, wordnum + len(child))) wordnum += len(child) else: wordnum += 1 return chunks def _syntax_highlight_grammar(self, grammar): if self.top is None: return self.grammarbox.tag_remove('comment', '1.0', 'end') self.grammarbox.tag_remove('angle', '1.0', 'end') self.grammarbox.tag_remove('brace', '1.0', 'end') self.grammarbox.tag_add('hangindent', '1.0', 'end') for lineno, line in enumerate(grammar.split('\n')): if not line.strip(): continue m = re.match(r'(\\.|[^#])*(#.*)?', line) comment_start = None if m.group(2): comment_start = m.start(2) s = '%d.%d' % (lineno + 1, m.start(2)) e = '%d.%d' % (lineno + 1, m.end(2)) self.grammarbox.tag_add('comment', s, e) for m in re.finditer('[<>{}]', line): if comment_start is not None and m.start() >= comment_start: break s = '%d.%d' % (lineno + 1, m.start()) e = '%d.%d' % (lineno + 1, m.end()) if m.group() in '<>': self.grammarbox.tag_add('angle', s, e) else: self.grammarbox.tag_add('brace', s, e) def _grammarcheck(self, grammar): if self.top is None: return self.grammarbox.tag_remove('error', '1.0', 'end') self._grammarcheck_errs = [] for lineno, line in enumerate(grammar.split('\n')): line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line) line = line.strip() if line: try: RegexpChunkRule.fromstring(line) except ValueError as e: self.grammarbox.tag_add( 'error', '%s.0' % (lineno + 1), '%s.0 lineend' % (lineno + 1) ) self.status['text'] = '' def update(self, *event): # Record when update was called (for grammarcheck) if event: self._last_keypress = time.time() # Read the grammar from the Text box. self.grammar = grammar = self.grammarbox.get('1.0', 'end') # If the grammar hasn't changed, do nothing: normalized_grammar = self.normalize_grammar(grammar) if normalized_grammar == self.normalized_grammar: return else: self.normalized_grammar = normalized_grammar # If the grammar has changed, and we're looking at history, # then stop looking at history. if self._history_index < len(self._history) - 1: self.grammarlabel['text'] = 'Grammar:' self._syntax_highlight_grammar(grammar) # The grammar has changed; try parsing it. If it doesn't # parse, do nothing. (flag error location?) try: # Note: the normalized grammar has no blank lines. if normalized_grammar: rules = [ RegexpChunkRule.fromstring(line) for line in normalized_grammar.split('\n') ] else: rules = [] except ValueError as e: # Use the un-normalized grammar for error highlighting. self._grammarcheck(grammar) self.chunker = None return self.chunker = RegexpChunkParser(rules) self.grammarbox.tag_remove('error', '1.0', 'end') self.grammar_changed = time.time() # Display the results if self._showing_trace: self.show_trace() else: self._highlight_devset() # Start the eval demon if not self._eval_demon_running: self._eval_demon() def _highlight_devset(self, sample=None): if sample is None: sample = self.devset[self.devset_index : self.devset_index + 1] self.devsetbox.tag_remove('true-pos', '1.0', 'end') self.devsetbox.tag_remove('false-neg', '1.0', 'end') self.devsetbox.tag_remove('false-pos', '1.0', 'end') # Run the grammar on the test cases. for sentnum, gold_tree in enumerate(sample): # Run the chunk parser test_tree = self._chunkparse(gold_tree.leaves()) # Extract gold & test chunks gold_chunks = self._chunks(gold_tree) test_chunks = self._chunks(test_tree) # Compare them. for chunk in gold_chunks.intersection(test_chunks): self._color_chunk(sentnum, chunk, 'true-pos') for chunk in gold_chunks - test_chunks: self._color_chunk(sentnum, chunk, 'false-neg') for chunk in test_chunks - gold_chunks: self._color_chunk(sentnum, chunk, 'false-pos') def _chunkparse(self, words): try: return self.chunker.parse(words) except (ValueError, IndexError) as e: # There's an error somewhere in the grammar, but we're not sure # exactly where, so just mark the whole grammar as bad. # E.g., this is caused by: "({})" self.grammarbox.tag_add('error', '1.0', 'end') # Treat it as tagging nothing: return words def _color_chunk(self, sentnum, chunk, tag): start, end = chunk self.devsetbox.tag_add( tag, '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]), '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end] - 1), ) def reset(self): # Clear various variables self.chunker = None self.grammar = None self.normalized_grammar = None self.grammar_changed = 0 self._history = [] self._history_index = 0 # Update the on-screen display. self.grammarbox.delete('1.0', 'end') self.show_devset(0) self.update() # self._eval_plot() SAVE_GRAMMAR_TEMPLATE = ( '# Regexp Chunk Parsing Grammar\n' '# Saved %(date)s\n' '#\n' '# Development set: %(devset)s\n' '# Precision: %(precision)s\n' '# Recall: %(recall)s\n' '# F-score: %(fscore)s\n\n' '%(grammar)s\n' ) def save_grammar(self, filename=None): if not filename: ftypes = [('Chunk Gramamr', '.chunk'), ('All files', '*')] filename = asksaveasfilename(filetypes=ftypes, defaultextension='.chunk') if not filename: return if self._history and self.normalized_grammar == self.normalize_grammar( self._history[-1][0] ): precision, recall, fscore = [ '%.2f%%' % (100 * v) for v in self._history[-1][1:] ] elif self.chunker is None: precision = recall = fscore = 'Grammar not well formed' else: precision = recall = fscore = 'Not finished evaluation yet' with open(filename, 'w') as outfile: outfile.write( self.SAVE_GRAMMAR_TEMPLATE % dict( date=time.ctime(), devset=self.devset_name, precision=precision, recall=recall, fscore=fscore, grammar=self.grammar.strip(), ) ) def load_grammar(self, filename=None): if not filename: ftypes = [('Chunk Gramamr', '.chunk'), ('All files', '*')] filename = askopenfilename(filetypes=ftypes, defaultextension='.chunk') if not filename: return self.grammarbox.delete('1.0', 'end') self.update() with open(filename, 'r') as infile: grammar = infile.read() grammar = re.sub( '^\# Regexp Chunk Parsing Grammar[\s\S]*' 'F-score:.*\n', '', grammar ).lstrip() self.grammarbox.insert('1.0', grammar) self.update() def save_history(self, filename=None): if not filename: ftypes = [('Chunk Gramamr History', '.txt'), ('All files', '*')] filename = asksaveasfilename(filetypes=ftypes, defaultextension='.txt') if not filename: return with open(filename, 'w') as outfile: outfile.write('# Regexp Chunk Parsing Grammar History\n') outfile.write('# Saved %s\n' % time.ctime()) outfile.write('# Development set: %s\n' % self.devset_name) for i, (g, p, r, f) in enumerate(self._history): hdr = ( 'Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, ' 'fscore=%.2f%%)' % (i + 1, len(self._history), p * 100, r * 100, f * 100) ) outfile.write('\n%s\n' % hdr) outfile.write(''.join(' %s\n' % line for line in g.strip().split())) if not ( self._history and self.normalized_grammar == self.normalize_grammar(self._history[-1][0]) ): if self.chunker is None: outfile.write('\nCurrent Grammar (not well-formed)\n') else: outfile.write('\nCurrent Grammar (not evaluated)\n') outfile.write( ''.join(' %s\n' % line for line in self.grammar.strip().split()) ) def about(self, *e): ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper" TITLE = 'About: Regular Expression Chunk Parser Application' try: from six.moves.tkinter_messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self.top, TITLE, ABOUT) def set_devset_size(self, size=None): if size is not None: self._devset_size.set(size) self._devset_size.set(min(len(self.devset), self._devset_size.get())) self.show_devset(1) self.show_devset(0) # what about history? Evaluated at diff dev set sizes! def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._smallfont.configure(size=min(-10, -(abs(size)) * 14 // 20)) def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. """ if in_idle(): return self.top.mainloop(*args, **kwargs) def app(): RegexpChunkApp().mainloop() if __name__ == '__main__': app() __all__ = ['app']