Source code for soton_corenlppy.re.dataset_support_lib

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
..
	/////////////////////////////////////////////////////////////////////////
	//
	// (c) Copyright University of Southampton IT Innovation, 2019
	//
	// Copyright in this software belongs to IT Innovation Centre of
	// Gamma House, Enterprise Road, Southampton SO16 7NS, UK.
	//
	// This software may not be used, sold, licensed, transferred, copied
	// or reproduced in whole or in part in any manner or form or in or
	// on any media by any person other than in accordance with the terms
	// of the Licence Agreement supplied with the software, or otherwise
	// without the prior written consent of the copyright owners.
	//
	// This software is distributed WITHOUT ANY WARRANTY, without even the
	// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
	// PURPOSE, except where stated in the Licence Agreement supplied with
	// the software.
	//
	// Created By : Stuart E. Middleton
	// Created Date : 2019/08/31
	// Created for Project: FLORAGUARD
	//
	/////////////////////////////////////////////////////////////////////////
	//
	// Dependencies: None
	//
	/////////////////////////////////////////////////////////////////////////

Support lib for working with pretrained embedding datasets and other large NLP corpora

"""

import array,sys,codecs,os,re,copy,math,multiprocessing,threading,traceback,logging,time,tempfile,subprocess,datetime,signal
import soton_corenlppy, soton_corenlppy.re

# note: the BERT support functions below call keras.preprocessing and keras.utils, but this listing never
# imported keras. a guarded import is added here as an assumption; depending on the installation
# 'from tensorflow import keras' may be required instead.
try :
	import keras
except ImportError :
	keras = None

#file_bert_vocab = '/projects/datasets/bert/wwm_cased_L-24_H-1024_A-16/vocab.txt'
#dir_propbank = '/projects/datasets/propbank-release-master'
#dir_ewt = '/projects/datasets/LDC/english web treebank/eng_web_tbk'
#training_file_limit = 20
#test_file_limit = 2
#bool_bert_vocab_lowercase = False

#file_bert_vocab = '/floraguard/bert-experiment/datasets/bert/wwm_cased_L-24_H-1024_A-16/vocab.txt'
#dir_propbank = '/floraguard/bert-experiment/datasets/propbank-release-master'
#dir_ewt = '/floraguard/bert-experiment/datasets/eng_web_tbk'

#
# propbank support functions
#

def read_propbank( propbank_dir = None, ewt_dir = None, max_files = None, dict_openie_config = None ) :
	"""
	read in the Propbank dataset and cross-index it with the English Web Treebank dataset to provide a set of SRL annotated sentences.

	:param unicode propbank_dir: location of Propbank dataset dir
	:param unicode ewt_dir: location of English Web Treebank dataset dir
	:param int max_files: max number of files to load (None for all files). this is useful for testing purposes.
	:param dict dict_openie_config: config object returned from soton_corenlppy.re.openie_lib.get_openie_config()

	:return: dict of Propbank file sent SRL annotations = { EWT_filename : { sent_index : ( [ word_token1, ... ], [ pos_token1, ... ], [ [ iob_token1, ... ], ... x N_clauses_in_sent ] ) } }
	:rtype: dict
	"""

	if not isinstance( propbank_dir, str ) :
		raise Exception( 'invalid propbank_dir' )
	if not isinstance( ewt_dir, str ) :
		raise Exception( 'invalid ewt_dir' )
	if not isinstance( max_files, (int,type(None)) ) :
		raise Exception( 'invalid max_files' )
	if not isinstance( dict_openie_config, dict ) :
		raise Exception( 'invalid dict_openie_config' )

	dictFiles = {}

	#
	# read propbank data files (to get SRL and POS labels for words)
	#
	# e.g. \propbank-release-master\data\google\ewt\answers\00\20070404104007AAY1Chs_ans.xml.gold_skel

	listFilesToProcess = []

	if os.path.exists( propbank_dir ) == False :
		raise Exception( 'propbank dir does not exist : ' + repr(propbank_dir) )

	strEnglishWebTreebankDir = os.path.abspath( propbank_dir ) + os.sep + 'data' + os.sep + 'google' + os.sep + 'ewt'

	listFiles1 = os.listdir( strEnglishWebTreebankDir )
	for strFile1 in listFiles1 :
		if os.path.isdir( strEnglishWebTreebankDir + os.sep + strFile1 ) :
			listFiles2 = os.listdir( strEnglishWebTreebankDir + os.sep + strFile1 )
			for strFile2 in listFiles2 :
				if os.path.isdir( strEnglishWebTreebankDir + os.sep + strFile1 + os.sep + strFile2 ) :
					listFiles3 = os.listdir( strEnglishWebTreebankDir + os.sep + strFile1 + os.sep + strFile2 )
					for strFile3 in listFiles3 :
						if strFile3.endswith( '.gold_skel' ) == True :
							listFilesToProcess.append( ( strEnglishWebTreebankDir + os.sep + strFile1 + os.sep + strFile2 + os.sep + strFile3, strFile1, strFile3 ) )

	# set a limit on number of files to read (useful for testing prior to a full run)
	if max_files == None :
		nMaxFiles = 1000000
	else :
		nMaxFiles = max_files

	nCountFiles = 0
	while (nCountFiles < nMaxFiles) and (nCountFiles < len(listFilesToProcess)) :
		(strFileWithPath, strDataset, strFile) = listFilesToProcess[nCountFiles]

		readHandle = codecs.open( strFileWithPath, 'r', 'utf-8', errors = 'replace' )
		listLines = readHandle.readlines()
		readHandle.close()

		#
		# Propbank example
		#
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 0 [WORD] WRB (TOP(S(SBARQ(WHADVP*) - - (ARGM-LOC*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 1 [WORD] MD (SQ* - - (ARGM-MOD*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 2 [WORD] PRP (NP*) - - (ARG0*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 3 [WORD] VB (VP* get get.01 (V*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 4 [WORD] NNS (NP*) - - (ARG1*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 5 [WORD] IN (PP* - - (ARGM-LOC* * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 6 [WORD] NNP (NP* - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 7 [WORD] NNP *))))) - - *) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 8 [WORD] , * - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 9 [WORD] PRP (S(S(NP*) - - * (ARG0*) *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 10 [WORD] MD (VP* - - * (ARGM-MOD*) *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 11 [WORD] VB (VP* liken like.01 * (V*) *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 12 [WORD] DT (NP* - - * (ARG1* *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 13 [WORD] JJ * - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 14 [WORD] NN *)))) - - * *) *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 15 [WORD] , * - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 16 [WORD] CC * - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 17 [WORD] PRP (S(NP*) - - * * (ARG0*)
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 18 [WORD] MD (VP* - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 19 [WORD] TO (S(VP* - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 20 [WORD] VB (VP* try try.01 * * (V*)
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 21 [WORD] NNS (NP*) - - * * (ARG1*)
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 22 [WORD] UH (INTJ*))))))) - - * * (ARGM-DIS*)
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 23 [WORD] . *)) - - * * *
		#
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 1 0 [WORD] PRP (TOP(S(S(NP*) - - (ARG0*) * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 1 1 [WORD] VBD (VP* search search.01 (V*) * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 1 2 [WORD] RB (PP* - - (ARG1* * * *

		dictSents = {}
		listWords = []
		listPOS = []
		listSRL = []
		listPredicateWordSense = []
		nSRLCount = 0
		nSentIndex = None

		for strLine in listLines :
			if len( strLine.strip() ) > 0 :
				# split by space and remove any empty strings (as there will be multiple spaces between values)
				listComponents = strLine.strip().split(' ')
				while listComponents.count('') :
					listComponents.remove('')

				if len(listComponents) < 7 :
					raise Exception('error parsing file (number of columns) ' + repr(strFileWithPath) + ' : ' + repr(strLine) )

				# sent index
				if nSentIndex != None :
					if int( listComponents[1] ) != nSentIndex :
						raise Exception('error parsing file (sent index) ' + repr(strFileWithPath) + ' : ' + repr(strLine) )
				else :
					nSentIndex = int( listComponents[1] )

				# word will always be [WORD] in propbank as we need to cross-ref it with EWT tokenized sentences
				listWords.append( listComponents[3] )
				listPOS.append( listComponents[4] )
				listSRL.append( listComponents[8:] )
				listPredicateWordSense.append( listComponents[7] )

				if nSRLCount == 0 :
					nSRLCount = len( listComponents[8:] )
				elif len( listComponents[8:] ) != nSRLCount :
					raise Exception('SRL count mismatch between words : ' + repr(strFileWithPath) + ' : ' + repr(strLine) )

			else :
				# newline is a sentence delimiter
				if len(listWords) > 0 :

					listSRLSets = []

					# no SRL? add sentence with all labels as 'O'
					if nSRLCount == 0 :
						listSRLSets.append( ['O'] * len(listWords) )
					else :
						# for each SRL entry add a new fully annotated sentence (so we will get several copies of sent with different SRL if it has multiple relations)
						# an SRL entry always terminates with a ')' so no need to look after open fragments

						# debug - force 1 SRL entry per sent for testing purposes
						# nSRLCount = 1
						# end debug

						for nIndexSRL in range(nSRLCount) :
							listSRLInstance = []
							strOpenRole = None
							strOpenPred = None

							for nTokenIndex in range(len(listSRL)) :
								entry = listSRL[nTokenIndex]
								strSRL = entry[nIndexSRL]

								if strSRL.startswith('(') :
									strOpenRole = strSRL.strip('()*')
									strIOB = 'B-' + strOpenRole
								elif strOpenRole != None :
									strIOB = 'I-' + strOpenRole
								else :
									strIOB = 'O'

								# append the predicate word sense to the verb IOB tag so it can be used to train
								# in SRL tasks the predicate location is provided but not the predicate wordsense
								if strSRL.startswith('(V*') :
									strOpenPred = listPredicateWordSense[nTokenIndex]
								if strOpenPred != None :
									strIOB = strIOB + '-' + strOpenPred

								if strSRL.endswith(')') :
									strOpenRole = None
									strOpenPred = None

								listSRLInstance.append( strIOB )

							listSRLSets.append( listSRLInstance )

							if strOpenRole != None :
								raise Exception('SRL parse error : ' + repr(strFileWithPath) + ' : ' + repr(strLine) )

					# { sent_index : ( list_words, list_pos, ( list_IOB, list_IOB, ... ) ) }
					dictSents[nSentIndex] = ( listWords, listPOS, listSRLSets )

				# reset sent and start again
				listWords = []
				listPOS = []
				listPredicateWordSense = []
				listSRL = []
				nSRLCount = 0
				nSentIndex = None

		# add propbank annotated sent data for this file
		# { ewt_source_filename : { sent_index : ( list_words, list_pos, ( list_IOB, list_IOB, ... ) ) } }
		strSourceFile = strFile[:-1*len('.xml.gold_skel')] + '.txt'
		strEWTSourceFile = ewt_dir + os.sep + 'data' + os.sep + strDataset + os.sep + 'source' + os.sep + 'source_text_ascii_tokenized' + os.sep + strSourceFile
		dictFiles[strEWTSourceFile] = dictSents

		# update file count
		nCountFiles = nCountFiles + 1

	#
	# read english web treebank data files (to get original words to replace placeholders in propbank)
	#
	# e.g. \eng_web_tbk\data\answers\source\source_text_ascii_tokenized\20070404104007AAY1Chs_ans.txt

	for strEWTSourceFile in dictFiles :

		# load source file
		readHandle = codecs.open( strEWTSourceFile, 'r', 'utf-8', errors = 'replace' )
		listLines = readHandle.readlines()
		readHandle.close()

		#
		# EWT example
		#
		#<en=1>where can I get morcillas in tampa bay , I will like the argentinian type , but I will to try anothers please ?
		#<en=2>I searched all over the internet , but I could not find one place in Tampa Bay that sells morcillas , also known as blood pudding , black pudding and blood sausages .
		#<en=3>I learned that morcillas are basically impossible to find all across the North American region .
		#<en=4>But I did find this website , www.igourmet.com , where they sell all types of sausages , including blood sausages !
		#<en=5>So follow the link at the bottom and buy some blood sausages .
		#<en=6>huh ?
		#<en=7>yuck !!
		#<en=8>I do n't know , and it is because I do n't like them , do you know that , morcillas is coagulated blood from animals , ewww

		nSentIndex = 0
		for strLine in listLines :
			if len( strLine.strip() ) > 0 :
				listComponents = strLine.strip().split(' ')
				if len(listComponents) < 1 :
					raise Exception('error parsing file (not enough tokens) ' + repr(strEWTSourceFile) + ' : ' + repr(strLine) )

				# remove sent index from first token
				if listComponents[0].count('>') > 0 :
					listComponents[0] = listComponents[0][ listComponents[0].index('>') + 1 : ]
				else :
					raise Exception('error parsing file (missing sent marker on first token) ' + repr(strEWTSourceFile) + ' : ' + repr(strLine) )

				# replace [WORD] propbank placeholders with the actual word from EWT source file
				( listWords, listPOS, listSRLSets ) = dictFiles[strEWTSourceFile][nSentIndex]
				if len(listWords) != len(listComponents) :
					raise Exception('error source file has different number of tokens to propbank file ' + repr(strEWTSourceFile) + ' : ' + repr(strLine) )

				for nIndexWord in range(len(listWords)) :
					listWords[nIndexWord] = listComponents[nIndexWord]

				# next sent
				nSentIndex = nSentIndex + 1

	# all done
	return dictFiles
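# usage sketch (illustrative only - the dataset paths are placeholders and the get_openie_config() arguments
# are omitted; neither is defined by this module) :
#
#	dict_config = soton_corenlppy.re.openie_lib.get_openie_config( ... )
#	dict_propbank = read_propbank(
#		propbank_dir = '/datasets/propbank-release-master',
#		ewt_dir = '/datasets/eng_web_tbk',
#		max_files = 20,
#		dict_openie_config = dict_config )
#	for strEWTFile in dict_propbank :
#		for nSentIndex in dict_propbank[strEWTFile] :
#			( listWords, listPOS, listSRLSets ) = dict_propbank[strEWTFile][nSentIndex]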
#
# BERT support functions
#
def generate_vocab( list_word_sets = None, list_tag_sets = None, dict_openie_config = None ) :
	"""
	generate word, tag and predicate vocabularies (and lookup indexes) from a set of sentences in BERT format.
	e.g. sent = [CLS] ... [SEP] ... [SEP] [PAD] [PAD] [PAD] ...

	:param list list_word_sets: list of words for each sent
	:param list list_tag_sets: list of tags for each sent
	:param dict dict_openie_config: config object returned from soton_corenlppy.re.openie_lib.get_openie_config()

	:return: tuple = ( list_words_vocab, list_tags_vocab, list_predicates_vocab, dict_index_words, dict_index_tags, dict_index_predicates, index_pad_word, index_pad_tag, index_pad_predicate )
	:rtype: tuple
	"""

	if not isinstance( list_word_sets, list ) :
		raise Exception( 'invalid list_word_sets' )
	if not isinstance( list_tag_sets, list ) :
		raise Exception( 'invalid list_tag_sets' )
	if not isinstance( dict_openie_config, dict ) :
		raise Exception( 'invalid dict_openie_config' )

	if (list_tag_sets != None) and (len(list_tag_sets) != len(list_word_sets)) :
		raise Exception( 'mismatch num sets - word, tag' )

	setWords = set([])
	setTags = set([])
	setPredicates = set([])

	for nIndex in range(len(list_word_sets)) :
		listTaggedSent = []
		listTokens = list_word_sets[nIndex].split(' ')

		if list_tag_sets != None :
			listTags = list_tag_sets[nIndex].split(' ')
		else :
			# if no tags are provided assign 'O' to all tokens
			listTags = ['O'] * len(listTokens)
			listTags[0] = '[CLS]'
			listTags[-1] = '[SEP]'

		for nIndexWord in range(len(listTokens)) :
			setWords.add( listTokens[nIndexWord] )
			setTags.add( listTags[nIndexWord] )

			if listTags[nIndexWord].startswith('B-V-') == True :
				# extract wordsense of predicate
				strPredicate = listTags[nIndexWord][ len('B-V-') : ]
			elif listTags[nIndexWord].startswith('I-V-') == True :
				strPredicate = 'X'
			else :
				strPredicate = 'O'

			setPredicates.add( strPredicate )

	# make set into an index so we can lookup phrases later by index id
	listWords = list(setWords)
	listTags = list(setTags)
	listPredicates = list(setPredicates)

	# add token for unknown vocabulary (for test corpus which might have vocabulary not seen in training corpus) and padding
	listWords.append( '???' )
	if not '[PAD]' in listWords :
		listWords.append( '[PAD]' )
	if not '[PAD]' in listTags :
		listTags.append( '[PAD]' )
	if not '[PAD]' in listPredicates :
		listPredicates.append( '[PAD]' )

	# make an inverted index of words and tags
	dictIndexWord = {}
	for nIndexEntry in range(len(listWords)) :
		dictIndexWord[listWords[nIndexEntry]] = nIndexEntry

	# padding token for words is '[PAD]'
	nPaddingIndexWord = dictIndexWord['[PAD]']

	dictIndexTag = {}
	for nIndexEntry in range(len(listTags)) :
		dictIndexTag[listTags[nIndexEntry]] = nIndexEntry

	# padding token for tags is '[PAD]'
	nPaddingIndexTag = dictIndexTag['[PAD]']

	dictIndexPredicate = {}
	for nIndexEntry in range(len(listPredicates)) :
		dictIndexPredicate[listPredicates[nIndexEntry]] = nIndexEntry

	# padding token for predicates is '[PAD]'
	nPaddingIndexPredicate = dictIndexPredicate['[PAD]']

	print(( 'padding word index = ', repr(nPaddingIndexWord) ))
	print(( 'padding tag index = ', repr(nPaddingIndexTag) ))
	print(( 'word list size = ', repr(len(listWords)) ))
	print(( 'tag list size = ', repr(len(listTags)) ))
	print(( 'predicate list size = ', repr(len(listPredicates)) ))
	print(( 'word index size = ', repr(len(dictIndexWord)) ))
	print(( 'tag index size = ', repr(len(dictIndexTag)) ))
	print(( 'predicate index size = ', repr(len(dictIndexPredicate)) ))

	return ( listWords, listTags, listPredicates, dictIndexWord, dictIndexTag, dictIndexPredicate, nPaddingIndexWord, nPaddingIndexTag, nPaddingIndexPredicate )
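# usage sketch (illustrative only - the sentence and tag strings below are made-up examples of the BERT
# style input this function expects; dict_config is the same config object used elsewhere in this module) :
#
#	listSents = [ '[CLS] the dog barked [SEP] barked [SEP]' ]
#	listTags = [ '[CLS] B-ARG0 I-ARG0 B-V-bark.01 [SEP] bark.01 [SEP]' ]
#	( listWordsVocab, listTagsVocab, listPredVocab,
#	  dictIndexWord, dictIndexTag, dictIndexPred,
#	  nPadWord, nPadTag, nPadPred ) = generate_vocab(
#		list_word_sets = listSents, list_tag_sets = listTags, dict_openie_config = dict_config )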
def generate_sequence( index_words = None, index_tags = None, index_predicates = None, padding_word_value = None, padding_tag_value = None, padding_predicate_value = None, sequence_length = None, list_word_sets = None, list_tag_sets = None, dict_openie_config = None ) :
	"""
	make a set of (word, tag, predicate) sequences for each sentence

	:param dict index_words: index of words from generate_vocab()
	:param dict index_tags: index of tags from generate_vocab()
	:param dict index_predicates: index of predicates from generate_vocab()
	:param int padding_word_value: index value of the pad word
	:param int padding_tag_value: index value of the pad tag
	:param int padding_predicate_value: index value of the pad predicate
	:param int sequence_length: max length of sequence (e.g. sentence length) - can be None for no limit. this is needed to limit sents to a fixed size for use in embeddings with BERT
	:param list list_word_sets: list of sentence word strings (BERT format)
	:param list list_tag_sets: list of sentence tag strings (BERT format)
	:param dict dict_openie_config: config object returned from soton_corenlppy.re.openie_lib.get_openie_config()

	:return: tuple = ( list_seq_words, list_seq_tags_categorical, list_seq_predicates_categorical, max_seq_length )
	:rtype: tuple
	"""

	if not isinstance( index_words, dict ) :
		raise Exception( 'invalid index_words' )
	if not isinstance( index_tags, dict ) :
		raise Exception( 'invalid index_tags' )
	if not isinstance( index_predicates, dict ) :
		raise Exception( 'invalid index_predicates' )
	if not isinstance( padding_word_value, int ) :
		raise Exception( 'invalid padding_word_value' )
	if not isinstance( padding_tag_value, int ) :
		raise Exception( 'invalid padding_tag_value' )
	if not isinstance( padding_predicate_value, int ) :
		raise Exception( 'invalid padding_predicate_value' )
	if not isinstance( sequence_length, (int,type(None)) ) :
		raise Exception( 'invalid sequence_length' )
	if not isinstance( list_word_sets, list ) :
		raise Exception( 'invalid list_word_sets' )
	if not isinstance( list_tag_sets, list ) :
		raise Exception( 'invalid list_tag_sets' )
	if not isinstance( dict_openie_config, dict ) :
		raise Exception( 'invalid dict_openie_config' )

	if (list_tag_sets != None) and (len(list_tag_sets) != len(list_word_sets)) :
		raise Exception( 'mismatch num sets - word, tag' )
	if (sequence_length != None) and (sequence_length < 2) :
		raise Exception( 'sequence_length too small' )

	nMaxLen = 0
	listTaggedSentSet = []
	for nIndex in range(len(list_word_sets)) :
		listTaggedSent = []
		listTokens = list_word_sets[nIndex].split(' ')
		#print( 'sent = ', repr(listTokens) )

		if sequence_length != None :
			# truncate sentence if its too long
			if len(listTokens) > sequence_length :
				listTokens = listTokens[:sequence_length]

			# force padded sequence length to be of this predefined length
			nMaxLen = sequence_length
		else :
			# get the longest sentence and use this as padded sequence length
			nMaxLen = max( nMaxLen, len(listTokens) )

		if list_tag_sets != None :
			listTags = list_tag_sets[nIndex].split(' ')
		else :
			listTags = None
		#print( 'tags = ', repr(listTags) )

		for nIndexWord in range(len(listTokens)) :
			# token not in the vocab list? replace with '???' if so
			strToken = listTokens[nIndexWord]
			if not listTokens[nIndexWord] in index_words :
				strToken = '???'

			# do we have a tag? if not its 'O' in IOB tagging scheme
			if listTags != None :
				strTag = listTags[nIndexWord]
			else :
				if nIndexWord == 0 :
					strTag = '[CLS]'
				elif nIndexWord == len(listTokens)-1 :
					strTag = '[SEP]'
				else :
					strTag = 'O'

			if strTag.startswith('B-V-') == True :
				strPredicate = strTag[ len('B-V-'): ]
				if not strPredicate in index_predicates :
					strPredicate = '???'
			elif strTag.startswith('I-V-') == True :
				strPredicate = 'X'
			else :
				strPredicate = 'O'

			listTaggedSent.append( ( strToken, strTag, strPredicate ) )

		listTaggedSentSet.append( listTaggedSent )
		#print( 'tagged sent = ', repr(listTaggedSent) )

	print(( 'max token length = ', repr(nMaxLen) ))

	# generate sequences of words and tags for training
	# - sequence = [CLS] some words give me trouble [SEP] give [SEP]
	# note: sequences are padded to max_length of longest word sentence (including the predicates) so they have the same size allowing embedding token index values to be added later
	listSequenceSetWords = []
	listSequenceSetPredicates = []
	listSequenceSetTags = []
	for listTaggedSent in listTaggedSentSet :
		listSequenceWords = []
		listSequenceTags = []
		listSequencePredicates = []

		for (strWord,strTag,strPredicate) in listTaggedSent :
			listSequenceWords.append( index_words[strWord] )
			listSequenceTags.append( index_tags[strTag] )
			listSequencePredicates.append( index_predicates[strPredicate] )

		#if len(listSequenceWords) != sequence_length :
		#	raise Exception('word length incorrect : ' + repr(len(listSequenceWords)) )

		listSequenceSetWords.append( listSequenceWords )
		listSequenceSetPredicates.append( listSequencePredicates )
		listSequenceSetTags.append( listSequenceTags )

	# pad sequences so they all have the same length
	# use int16 to save memory. we need to represent word and tag indexes in the sequence; a conservative limit of 16,384 unique entries keeps indexes safely inside the signed int16 range (propbank has about 20,000 unique words, bert 30,000)
	# note: keras is assumed to be available here (see the guarded import note at the top of this module)
	if len(index_words) > 16*1024 :
		raise Exception('int16 too small for number of words')
	listSequenceSetWords = keras.preprocessing.sequence.pad_sequences( maxlen = nMaxLen, sequences = listSequenceSetWords, padding = 'post', value = padding_word_value, dtype='int16' )

	if len(index_predicates) > 16*1024 :
		raise Exception('int16 too small for number of predicates')
	listSequenceSetPredicates = keras.preprocessing.sequence.pad_sequences( maxlen = nMaxLen, sequences = listSequenceSetPredicates, padding = 'post', value = padding_predicate_value, dtype='int16' )

	if len(index_tags) > 16*1024 :
		raise Exception('int16 too small for number of tags')
	listSequenceSetTags = keras.preprocessing.sequence.pad_sequences( maxlen = nMaxLen, sequences = listSequenceSetTags, padding = 'post', value = padding_tag_value, dtype='int16' )

	# convert tag values to keras 'categorical' type as we have a multi-class problem (each word can have a tag from a set of mutually exclusive labels)
	# later this means we will use the loss function "categorical_crossentropy".
	# use int16 to save memory as this will be converted into large array = Matrix[Nsent_length x Nsent_length] x Nsent
	listSequenceSetTagsCategorical = []
	for nIndexSequence in range(len(listSequenceSetTags)) :
		matrixObj = keras.utils.to_categorical( y = listSequenceSetTags[nIndexSequence], num_classes = len(index_tags), dtype='int16' )
		listSequenceSetTagsCategorical.append( matrixObj )

	# convert predicate values to keras 'categorical' type as we have a multi-class problem (each word can have a tag from a set of mutually exclusive labels)
	listSequenceSetPredicatesCategorical = []
	for nIndexSequence in range(len(listSequenceSetPredicates)) :
		matrixObj = keras.utils.to_categorical( y = listSequenceSetPredicates[nIndexSequence], num_classes = len(index_predicates), dtype='int16' )
		listSequenceSetPredicatesCategorical.append( matrixObj )

	# return the values
	return ( listSequenceSetWords, listSequenceSetTagsCategorical, listSequenceSetPredicatesCategorical, nMaxLen )
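# usage sketch (illustrative only - continues the generate_vocab() sketch above; keras must be installed
# for the padding and one-hot conversion to work) :
#
#	( listSeqWords, listSeqTagsCat, listSeqPredCat, nMaxLen ) = generate_sequence(
#		index_words = dictIndexWord, index_tags = dictIndexTag, index_predicates = dictIndexPred,
#		padding_word_value = nPadWord, padding_tag_value = nPadTag, padding_predicate_value = nPadPred,
#		sequence_length = 128,
#		list_word_sets = listSents, list_tag_sets = listTags,
#		dict_openie_config = dict_config )
#	# listSeqWords feeds an embedding layer; listSeqTagsCat is the one-hot (categorical_crossentropy) target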
def create_corpus( dict_propbank = None, pad_to_size = None, test_fraction = 0.1, dict_openie_config = None ) :
	"""
	create a BERT style training corpus from propbank data. e.g. sent = [CLS] ... [SEP] ... [SEP] [PAD] [PAD] [PAD] ...

	:param dict dict_propbank: propbank data from read_propbank()
	:param int pad_to_size: size to pad sequences to (can be None)
	:param float test_fraction: fraction of corpus to use as test data
	:param dict dict_openie_config: config object returned from soton_corenlppy.re.openie_lib.get_openie_config()

	:return: tuple = ( list_train_corpus_words, list_train_corpus_tags, list_test_corpus_words, list_test_corpus_tags )
	:rtype: tuple
	"""

	if not isinstance( dict_propbank, dict ) :
		raise Exception( 'invalid dict_propbank' )
	if not isinstance( pad_to_size, (int,type(None)) ) :
		raise Exception( 'invalid pad_to_size' )
	if not isinstance( test_fraction, (float,type(None)) ) :
		raise Exception( 'invalid test_fraction' )
	if not isinstance( dict_openie_config, dict ) :
		raise Exception( 'invalid dict_openie_config' )

	listTrainingCorpusWords = []
	listTrainingCorpusTags = []

	# note: the original listing looped over an undefined name (dictPropbankFiles); dict_propbank is used here
	for strFile in dict_propbank :
		dictSents = dict_propbank[strFile]

		for nSentIndex in dictSents :
			( listWords, listPOS, listSRLSets ) = dictSents[nSentIndex]

			# init tokenized structures
			listWordsTokenized = []
			listPOSTokenized = []
			listSRLSetsTokenized = []
			for listSRL in listSRLSets :
				listSRLSetsTokenized.append( [] )

			# apply WordPiece tokenization to each word and insert N pieces using the original word/pos/srl tags
			# note: wordpiece_tokenizer is expected to be a module-level BERT WordPiece tokenizer (e.g. built from file_bert_vocab); it is not defined in this listing
			for nIndexWord in range(len(listWords)) :
				strWord = listWords[nIndexWord]
				strPOS = listPOS[nIndexWord]

				# get word pieces for this word
				listWordPieces = wordpiece_tokenizer.tokenize( strWord )

				# insert pieces (words and pos)
				for strPiece in listWordPieces :
					listWordsTokenized.append( strPiece )
					listPOSTokenized.append( strPOS )

				# insert pieces (srl)
				for nIndexSRLSet in range(len(listSRLSets)) :
					listSRL = listSRLSets[nIndexSRLSet]
					if len(listSRL) != len(listWords) :
						raise Exception( 'Word len != SRL len' )

					bBegin = False
					for strPiece in listWordPieces :
						# ensure only the first piece has the begin SRL label
						if (listSRL[nIndexWord].startswith('B-') == True) and (bBegin == True) :
							strSRL = 'I-' + listSRL[nIndexWord][2:]
						elif (listSRL[nIndexWord].startswith('B-') == True) and (bBegin == False) :
							strSRL = listSRL[nIndexWord]
							bBegin = True
						else :
							strSRL = listSRL[nIndexWord]

						listSRLSetsTokenized[nIndexSRLSet].append( strSRL )

			# create the training corpus words and tags
			for listSRL in listSRLSetsTokenized :

				# get the first predicate token (if any) to be added at the end of the training sent
				strPredicatePhrase = '[NONE]'
				strPredicateTagPhrase = '[NONE]'
				for nIndexToken in range(len(listSRL)) :
					strTag = listSRL[nIndexToken]
					if strTag.startswith( 'B-V-' ) == True :
						strPredicatePhrase = listWordsTokenized[nIndexToken]
						strPredicateTagPhrase = strTag[ len('B-V-') : ]

				'''
				listPredicateWords = []
				listPredicateTags = []
				for nIndexToken in range(len(listSRL)) :
					strTag = listSRL[nIndexToken]
					if (strTag.startswith( 'B-V' ) == True) or (strTag.startswith( 'I-V' ) == True) :
						strWord = listWordsTokenized[nIndexToken]
						listPredicateWords.append( strWord )
						listPredicateTags.append( strTag )

				if len(listPredicateWords) > 0 :
					strPredicatePhrase = ' '.join( listPredicateWords ) + ' '
					strPredicateTagPhrase = ' '.join( listPredicateTags ) + ' '
				else :
					strPredicatePhrase = '--- '
					strPredicateTagPhrase = '--- '
				'''

				# pad sentence (leaving 3 for [SEP] pred [SEP] at end)
				if (pad_to_size != None) and (len(listWordsTokenized) < pad_to_size) :
					for nPadIndex in range( pad_to_size - len(listWordsTokenized) - 3 ) :
						listWordsTokenized.append( '[PAD]' )
						listSRL.append( '[PAD]' )

				# add a copy of the sent for each different SRL annotation
				# this means the same sent will appear with its full variety of SRL annotations
				listTrainingCorpusWords.append( '[CLS] ' + ' '.join( listWordsTokenized ) + ' [SEP] ' + strPredicatePhrase + ' [SEP]' )
				listTrainingCorpusTags.append( '[CLS] ' + ' '.join( listSRL ) + ' [SEP] ' + strPredicateTagPhrase + ' [SEP]' )

	# for testing make the test corpus a fraction of the training set (should get really good SRL classification rates)
	if test_fraction == None :
		nNumTestFiles = 0
	else :
		nNumTestFiles = int( test_fraction * len(listTrainingCorpusWords) )

	listTestCorpusWords = listTrainingCorpusWords[:nNumTestFiles]
	listTestCorpusTags = listTrainingCorpusTags[:nNumTestFiles]
	listTrainingCorpusWords = listTrainingCorpusWords[nNumTestFiles:]
	listTrainingCorpusTags = listTrainingCorpusTags[nNumTestFiles:]

	# all done
	return ( listTrainingCorpusWords, listTrainingCorpusTags, listTestCorpusWords, listTestCorpusTags )
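# usage sketch (illustrative only - assumes dict_propbank from read_propbank() and a module-level BERT
# wordpiece_tokenizer, neither of which is set up in this listing) :
#
#	( listTrainWords, listTrainTags, listTestWords, listTestTags ) = create_corpus(
#		dict_propbank = dict_propbank, pad_to_size = 128, test_fraction = 0.1, dict_openie_config = dict_config )
#	# the corpus strings can then be passed to generate_vocab() and generate_sequence()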
#
# streusle support functions
#
def read_streusle( streusle_home = None, allowed_id_set = None, dict_config = None ) :
	"""
	read in the streusle dataset. for information about the corpus see https://github.com/nert-nlp/streusle/blob/master/CONLLULEX.md

	:param unicode streusle_home: dir of streusle dataset
	:param list allowed_id_set: list of allowed IDs (None for no filter)
	:param dict dict_config: config object

	:return: dict = { sent_index : { 'sent_id' : <str>, 'text' : <str>, 'tokens' : <dict>, 'phrases' : <dict>, 'phrases_addr' : <dict> } }. 'tokens' is indexed by token position, each entry a list of the remaining CoNLL-U-Lex columns with multi-word expression addresses converted from str to tuple(mwe_id, rel_position_in_mwe). 'phrases' is a dict with key verb|prep|noun and value of phrases. 'phrases_addr' is the same but with a list of token addresses for each phrase not a string.
	:rtype: dict
	"""

	if not isinstance( streusle_home, str ) :
		raise Exception( 'invalid streusle_home' )
	if not isinstance( allowed_id_set, (list,type(None)) ) :
		raise Exception( 'invalid allowed_id_set' )
	if not isinstance( dict_config, dict ) :
		raise Exception( 'invalid dict_config' )

	if not 'verb_phrase_lexcat' in dict_config :
		raise Exception( 'verb_phrase_lexcat missing from config' )
	if not 'prep_phrase_lexcat' in dict_config :
		raise Exception( 'prep_phrase_lexcat missing from config' )
	if not 'noun_phrase_lexcat' in dict_config :
		raise Exception( 'noun_phrase_lexcat missing from config' )

	dictClassPatterns = { 'verb' : dict_config['verb_phrase_lexcat'], 'prep' : dict_config['prep_phrase_lexcat'], 'noun' : dict_config['noun_phrase_lexcat'] }

	readHandle = None
	writeHandle = None

	#
	# load streusle data file 'streusle.conllulex' into memory as a dict indexed by streusle_sent_id
	#

	try :
		if os.path.exists( streusle_home ) == False :
			raise Exception( 'streusle home dir missing : ' + repr(streusle_home) )
		strInputFile = os.path.abspath( streusle_home ) + os.sep + 'streusle.conllulex'

		readHandle = codecs.open( strInputFile, 'r', 'utf-8', errors = 'replace' )
		listLines = readHandle.readlines()
		readHandle.close()

		# # newdoc id = reviews-001325
		# # sent_id = reviews-001325-0001
		# # text = Highly recommended
		# # streusle_sent_id = ewtb.r.001325.1
		# # mwe = Highly recommended
		# 1 Highly highly ADV RB _ 2 advmod 2:advmod _ _ ADV highly _ _ _ _ _ O-ADV
		# 2 recommended recommend VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ _ V recommend v.communication _ _ _ _ O-V-v.communication

		dictSentSet = {}
		listTokens = []
		strSentID = None
		dictSentTokens = {}
		nSentIndex = 0
		nTokenIndex = 0

		for strLine in listLines :
			strLineClean = strLine.strip()

			if strLineClean.startswith('# streusle_sent_id =') :
				strSentID = strLineClean[ len('# streusle_sent_id =') : ]

			elif len(strLineClean) == 0 :
				# newline indicates the end of a sentence so add a new sent entry
				# note: make text by using tokens NOT the original untokenized text to preserve the token index for stuff like 'spot-on' == 'spot' '-' 'on'
				if (allowed_id_set == None) or (nSentIndex in allowed_id_set) :
					dictSentSet[ nSentIndex ] = { 'sent_id' : strSentID, 'text' : ' '.join( listTokens ), 'tokens' : dictSentTokens }

				# reset sent data
				listTokens = []
				strSentID = None
				dictSentTokens = {}
				nTokenIndex = 0
				nSentIndex = nSentIndex + 1

			elif not strLineClean.startswith('#') :
				listParts = strLineClean.split('\t')
				if len(listParts) != 19 :
					raise Exception( 'token entry does not have 19 columns : ' + repr(strLineClean) )

				# strong multi-word expression (SMWE) = [ mwe_id, relative_token_position_in_group ] e.g. [1,1] = mwe #1, 1st token
				if not listParts[10] == '_' :
					listParts2 = listParts[10].split(':')
					tupleMWE = ( int(listParts2[0]), int(listParts2[1]) )
				else :
					tupleMWE = None
				listParts[10] = tupleMWE

				# weak multi-word expression (WMWE) = [ mwe_id, relative_token_position_in_group ] e.g. [1,1] = mwe #1, 1st token
				if not listParts[15] == '_' :
					listParts2 = listParts[15].split(':')
					tupleMWE = ( int(listParts2[0]), int(listParts2[1]) )
				else :
					tupleMWE = None
				listParts[15] = tupleMWE

				# add token details to the dict. tokens appear sequentially so use the sequence index NOT the 1st column index as sometimes there is a value of 6, 7, *7.1*, 8 ...
				dictSentTokens[ nTokenIndex ] = listParts[1:]
				listTokens.append( listParts[1] )
				nTokenIndex = nTokenIndex + 1

		if strSentID != None :
			# add last sent entry if we have one left running
			if (allowed_id_set == None) or (nSentIndex in allowed_id_set) :
				dictSentSet[ nSentIndex ] = { 'sent_id' : strSentID, 'text' : ' '.join( listTokens ), 'tokens' : dictSentTokens }

	finally :
		if readHandle != None :
			readHandle.close()
			readHandle = None

	dict_config['logger'].info('loaded ' + str(len(dictSentSet)) + ' streusle sents')

	#
	# construct verb, prep and noun phrase sets, using the MWE (multi-word expression) annotations
	#

	for nSentID in dictSentSet :
		dictPhrase = {}
		nGroupIndexSingle = 1000

		# first pass get all single tokens and MWE that match a LEXCAT of interest
		for nTokenID in dictSentSet[nSentID]['tokens'] :
			# remember original index 0 was removed and became token_id
			strToken = dictSentSet[nSentID]['tokens'][nTokenID][0]
			strLexCat = dictSentSet[nSentID]['tokens'][nTokenID][10]

			# only bother with SMWE
			tupleMWE = dictSentSet[nSentID]['tokens'][nTokenID][9]

			for strType in dictClassPatterns :
				for strLexCatAllowed in dictClassPatterns[strType] :
					if strLexCatAllowed == strLexCat :

						if tupleMWE == None :
							# add single tokens in a unique new group ID
							dictPhrase[ nGroupIndexSingle ] = [ [ strToken ], [ nTokenID ], strType ]
							nGroupIndexSingle = nGroupIndexSingle + 1
						else :
							# add multi-word tokens in the group they are assigned to
							nGroup = tupleMWE[0]
							nTokenPosInGroup = tupleMWE[1]

							# 1 = first token in group
							if not nGroup in dictPhrase :
								dictPhrase[nGroup] = [ [], [], strType ]
							if len(dictPhrase[nGroup][0]) < nTokenPosInGroup :
								listExtend = [''] * ( nTokenPosInGroup - len(dictPhrase[nGroup][0]) )
								listExtendAddr = [-1] * ( nTokenPosInGroup - len(dictPhrase[nGroup][1]) )
								dictPhrase[nGroup][0].extend( listExtend )
								dictPhrase[nGroup][1].extend( listExtendAddr )

							dictPhrase[nGroup][0][nTokenPosInGroup-1] = strToken
							dictPhrase[nGroup][1][nTokenPosInGroup-1] = nTokenID

		# second pass fill in any group tokens that are missing (as they might have a LEXCAT of '_')
		for nTokenID in dictSentSet[nSentID]['tokens'] :
			# remember original index 0 was removed and became token_id
			strToken = dictSentSet[nSentID]['tokens'][nTokenID][0]

			# SMWE
			tupleMWE = dictSentSet[nSentID]['tokens'][nTokenID][9]

			if tupleMWE != None :
				nGroup = tupleMWE[0]
				nTokenPosInGroup = tupleMWE[1]

				# if group is in phrase list then it matched LEXCAT
				# so add this MWE token to fill in any gaps
				if nGroup in dictPhrase :
					if len(dictPhrase[nGroup][0]) < nTokenPosInGroup :
						listExtend = [''] * ( nTokenPosInGroup - len(dictPhrase[nGroup][0]) )
						listExtendAddr = [-1] * ( nTokenPosInGroup - len(dictPhrase[nGroup][1]) )
						dictPhrase[nGroup][0].extend( listExtend )
						dictPhrase[nGroup][1].extend( listExtendAddr )

					dictPhrase[nGroup][0][nTokenPosInGroup-1] = strToken
					dictPhrase[nGroup][1][nTokenPosInGroup-1] = nTokenID

		# convert the phrase dict so it has the type as the index
		dictTypeIndexedPhrases = {}
		dictTypeIndexedPhrasesAddr = {}
		for nGroup in dictPhrase :
			listPhrase = dictPhrase[nGroup][0]
			listPhraseAddr = dictPhrase[nGroup][1]
			strType = dictPhrase[nGroup][2]

			if not strType in dictTypeIndexedPhrases :
				dictTypeIndexedPhrases[strType] = []
				dictTypeIndexedPhrasesAddr[strType] = []

			dictTypeIndexedPhrases[strType].append( ' '.join( listPhrase ) )
			dictTypeIndexedPhrasesAddr[strType].append( listPhraseAddr )

		# add phrase dict to sent data
		dictSentSet[nSentID]['phrases'] = dictTypeIndexedPhrases
		dictSentSet[nSentID]['phrases_addr'] = dictTypeIndexedPhrasesAddr

	return dictSentSet
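# usage sketch (illustrative only - the lexcat lists shown are examples, not the definitive STREUSLE sets,
# and dict_config must also provide a 'logger') :
#
#	dict_config['verb_phrase_lexcat'] = [ 'V' ]
#	dict_config['prep_phrase_lexcat'] = [ 'P', 'PP' ]
#	dict_config['noun_phrase_lexcat'] = [ 'N', 'PRON' ]
#	dict_streusle = read_streusle( streusle_home = '/datasets/streusle', allowed_id_set = None, dict_config = dict_config )
#	print( dict_streusle[0]['phrases'] )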
def streusle_to_IOB( dict_sent_set = None, max_processes = 1, dict_config = None ) :
	"""
	compute a set of IOB tags for [noun, verb, prep] from a sent set returned by read_streusle()

	:param dict dict_sent_set: dict returned by read_streusle()
	:param int max_processes: max number of processes to use for POS tagging
	:param dict dict_config: config object

	:return: list of sents, each a list of IOB annotated token entries = [ [ ( token, pos, IOB ), ... ], ... ]
	:rtype: list
	"""

	if not isinstance( dict_sent_set, dict ) :
		raise Exception( 'invalid dict_sent_set' )
	if not isinstance( max_processes, int ) :
		raise Exception( 'invalid max_processes' )
	if not isinstance( dict_config, dict ) :
		raise Exception( 'invalid dict_config' )

	# IOB files have a line per token, and a newline to indicate a new sentence
	# streusle IOB annotations have classes for noun, verb and prep
	#
	# e.g.
	#
	# I PRP B-noun
	# love VRB B-verb
	# New NN B-noun
	# York NN I-noun
	# ! - O
	#
	# Its PRP B-noun
	# lovely VRB B-verb

	# POS tag all sentences (replacing the POS tags which are not stanford POS tags)
	dictSentTokenized = {}
	dictText = {}
	for nSentID in dict_sent_set :
		listTokens = []
		for nTokenID in dict_sent_set[nSentID]['tokens'] :
			strToken = dict_sent_set[nSentID]['tokens'][nTokenID][0]
			listTokens.append( strToken )

		# sent id's are strings
		strSentID = str(nSentID)
		dictSentTokenized[ strSentID ] = [ listTokens ]
		dictText[ strSentID ] = dict_sent_set[nSentID]['text']

	# run stanford coreNLP to do tokenize, pos
	# the tokenize is constrained to only split sents at EOL so the input sent count == output sent count
	# also only split using whitespace, so the original streusle tokenization is preserved (and hence the number of tokens remains the same)
	( dictStanfordTokens, dictStanfordPOS ) = soton_corenlppy.re.comp_sem_lib.exec_stanford_corenlp( dict_text = dictText, work_dir = os.path.abspath( '.' ), annotators = 'tokenize,ssplit,pos', option_list = ['-tokenizeOptions','americanize=false','-tokenize.whitespace','true','-ssplit.eolonly','true'], num_processes = max_processes, dict_openie_config = dict_config )

	'''
	# relic NLTK POS tagger use
	dictTaggedSents = soton_corenlppy.common_parse_lib.pos_tag_tokenset_batch( document_token_set = dictSentTokenized, lang = 'en', dict_common_config = dict_config, max_processes = max_processes, timeout = 300 )
	'''

	# compute IOB tags
	listType = [ 'noun','verb','prep' ]
	listSentsIOB = []
	strLastClass = None

	listSortedSentID = sorted( list( dict_sent_set.keys() ) )
	for nSentID in listSortedSentID :
		strSentID = str(nSentID)
		dictTypeIndexedPhrasesAddr = dict_sent_set[nSentID]['phrases_addr']

		# stanford POS sent id's are strings
		treeObjPOS = dictStanfordPOS[strSentID]

		if len(dict_sent_set[nSentID]['tokens']) != len(treeObjPOS) :
			raise Exception( 'stanford CoreNLP tokenization != streusle tokenization (review stanford options, this should not be possible with -tokenize.whitespace true)' )

		listIOB = []
		for nTokenID in dict_sent_set[nSentID]['tokens'] :
			strToken = dict_sent_set[nSentID]['tokens'][nTokenID][0]
			#strPOS = dictTaggedSents[nSentID][0][nTokenID][1]
			strPOS = treeObjPOS[nTokenID].label()

			# look to see if the token address appears in a labelled phrase. if so that's its class
			strClass = None
			for strType in listType :
				if strType in dictTypeIndexedPhrasesAddr :
					for listPhraseAddr in dictTypeIndexedPhrasesAddr[strType] :
						for nAddr in listPhraseAddr :
							if nAddr == nTokenID :
								strClass = strType
								break
						if strClass != None :
							break
				if strClass != None :
					break

			if strClass == None :
				strIOBTag = 'O'
			elif strClass == strLastClass :
				strIOBTag = 'I-' + strClass
			else :
				strIOBTag = 'B-' + strClass

			strLastClass = strClass

			listIOB.append( ( strToken, strPOS, strIOBTag ) )

		# new sent
		listSentsIOB.append( listIOB )

	# all done
	return listSentsIOB
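# usage sketch (illustrative only - continues the read_streusle() sketch above; Stanford CoreNLP must be
# available to soton_corenlppy.re.comp_sem_lib.exec_stanford_corenlp()) :
#
#	listSentsIOB = streusle_to_IOB( dict_sent_set = dict_streusle, max_processes = 4, dict_config = dict_config )
#	for ( strToken, strPOS, strIOB ) in listSentsIOB[0] :
#		print( strToken, strPOS, strIOB )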
def sentences_to_IOB( sentences_file = None, allowed_id_set = None, tokenize_sents = True, max_processes = 1, dict_config = None ) :
	"""
	read in a sentence file, then do POS tagging and generate an IOB file with every IOB tag set to a default of 'O'.
	if the sentence file has no tabs the sent index is assumed to be the row number. if the sentence file has tabs the 1st column is assumed to be the sent index and the 2nd column the text.

	:param unicode sentences_file: sentences filename to read
	:param list allowed_id_set: list of allowed IDs (None for no filter)
	:param bool tokenize_sents: if True use Treebank to tokenize, otherwise split sent using spaces
	:param int max_processes: max number of processes to use for POS tagging
	:param dict dict_config: config object

	:return: sent_ID's, sent_IOB = list of sent ID's; list of sents, each a list of IOB annotated token entries = [ [ ( token, pos, IOB ), ... ], ... ]
	:rtype: list, list
	"""

	if not isinstance( sentences_file, str ) :
		raise Exception( 'invalid sentences_file' )
	if not isinstance( allowed_id_set, (list,type(None)) ) :
		raise Exception( 'invalid allowed_id_set' )
	if not isinstance( tokenize_sents, bool ) :
		raise Exception( 'invalid tokenize_sents' )
	if not isinstance( max_processes, int ) :
		raise Exception( 'invalid max_processes' )
	if not isinstance( dict_config, dict ) :
		raise Exception( 'invalid dict_config' )

	# read in sentences text
	readHandle = None
	dictSentences = {}
	try :
		readHandle = codecs.open( sentences_file, 'r', 'utf-8', errors = 'replace' )
		listLines = readHandle.readlines()
		readHandle.close()

		nCount = 0
		for strLine in listLines :
			strLineClean = strLine.strip()
			if len(strLineClean) == 0 :
				continue

			listParts = strLineClean.split('\t')
			if len(listParts) > 1 :
				strSentID = str(listParts[0])
				if (allowed_id_set == None) or (strSentID in allowed_id_set) :
					dictSentences[strSentID] = listParts[1]
			else :
				strSentID = str(nCount)
				if (allowed_id_set == None) or (strSentID in allowed_id_set) :
					dictSentences[strSentID] = listParts[0]
				nCount = nCount + 1

	finally :
		if readHandle != None :
			readHandle.close()

	# tokenization and POS tagging
	dictSentTokenized = {}
	for strSentID in dictSentences :
		strUTF8Text = dictSentences[strSentID]

		if tokenize_sents == True :
			listTokens = soton_corenlppy.common_parse_lib.unigram_tokenize_text( text = strUTF8Text, dict_common_config = dict_config )
		else :
			listTokens = strUTF8Text.split(' ')

		dictSentTokenized[ strSentID ] = [ listTokens ]

	# POS tag document set
	# run stanford coreNLP to do tokenize, pos
	# the tokenize is constrained to only split sents at EOL so the input sent count == output sent count
	# also only split using whitespace, so the original tokenization is preserved (and hence the number of tokens remains the same)
	( dictStanfordTokens, dictStanfordPOS ) = soton_corenlppy.re.comp_sem_lib.exec_stanford_corenlp( dict_text = dictSentences, work_dir = os.path.abspath( '.' ), annotators = 'tokenize,ssplit,pos', option_list = ['-tokenizeOptions','americanize=false','-tokenize.whitespace','true','-ssplit.eolonly','true'], num_processes = max_processes, dict_openie_config = dict_config )

	'''
	# relic NLTK POS tagging
	dictTaggedSents = soton_corenlppy.common_parse_lib.pos_tag_tokenset_batch( document_token_set = dictSentTokenized, lang = 'en', dict_common_config = dict_config, max_processes = max_processes, timeout = 300 )
	'''

	# make IOB with 'O' tags
	listSentIOB = []
	listSortedSentID = sorted( list( dictSentences.keys() ) )
	for strSentID in listSortedSentID :
		treeObjPOS = dictStanfordPOS[strSentID]

		listIOB = []
		#for listPOS in dictTaggedSents[nSentIndex][0] :
		#	listIOB.append( ( listPOS[0], listPOS[1], 'O' ) )
		for nToken in range(len(treeObjPOS)) :
			strToken = ' '.join( treeObjPOS[nToken].leaves() )
			strPOS = treeObjPOS[nToken].label()
			listIOB.append( ( strToken, strPOS, 'O' ) )

		listSentIOB.append( listIOB )

	# all done
	return listSortedSentID, listSentIOB
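# usage sketch (illustrative only - 'sentences.txt' is a placeholder file of either "<text>" or "<id>\t<text>"
# rows; Stanford CoreNLP must be available as for streusle_to_IOB()) :
#
#	listSentID, listSentIOB = sentences_to_IOB( sentences_file = 'sentences.txt', allowed_id_set = None, tokenize_sents = True, max_processes = 4, dict_config = dict_config )
#	for nIndex in range(len(listSentID)) :
#		print( listSentID[nIndex], listSentIOB[nIndex] )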