Source code for soton_corenlppy.re.dataset_support_lib

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
..
	/////////////////////////////////////////////////////////////////////////
	//
	// (c) Copyright University of Southampton IT Innovation, 2019
	//
	// Copyright in this software belongs to IT Innovation Centre of
	// Gamma House, Enterprise Road, Southampton SO16 7NS, UK.
	//
	// This software may not be used, sold, licensed, transferred, copied
	// or reproduced in whole or in part in any manner or form or in or
	// on any media by any person other than in accordance with the terms
	// of the Licence Agreement supplied with the software, or otherwise
	// without the prior written consent of the copyright owners.
	//
	// This software is distributed WITHOUT ANY WARRANTY, without even the
	// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
	// PURPOSE, except where stated in the Licence Agreement supplied with
	// the software.
	//
	// Created By : Stuart E. Middleton
	// Created Date : 2019/08/31
	// Created for Project: FLORAGUARD
	//
	/////////////////////////////////////////////////////////////////////////
	//
	// Dependencies: None
	//
	/////////////////////////////////////////////////////////////////////////

Support lib for working with pretrained embedding datasets and other large NLP corpora

"""

import array,sys,codecs,os,re,copy,math,multiprocessing,threading,traceback,logging,time,tempfile,subprocess,datetime,signal
import soton_corenlppy, soton_corenlppy.re

# note: the BERT support functions below call keras.preprocessing and keras.utils, but this listing never
# imported keras. a guarded import is added here as an assumption; depending on the installation
# 'from tensorflow import keras' may be required instead.
try :
	import keras
except ImportError :
	keras = None

#file_bert_vocab = '/projects/datasets/bert/wwm_cased_L-24_H-1024_A-16/vocab.txt'
#dir_propbank = '/projects/datasets/propbank-release-master'
#dir_ewt = '/projects/datasets/LDC/english web treebank/eng_web_tbk'
#training_file_limit = 20
#test_file_limit = 2
#bool_bert_vocab_lowercase = False

#file_bert_vocab = '/floraguard/bert-experiment/datasets/bert/wwm_cased_L-24_H-1024_A-16/vocab.txt'
#dir_propbank = '/floraguard/bert-experiment/datasets/propbank-release-master'
#dir_ewt = '/floraguard/bert-experiment/datasets/eng_web_tbk'

#
# propbank support functions
#

def read_propbank( propbank_dir = None, ewt_dir = None, max_files = None, dict_openie_config = None ) :
	"""
	read in the Propbank dataset and cross-index it with the English Web Treebank dataset to provide a set of SRL annotated sentences.

	:param unicode propbank_dir: location of Propbank dataset dir
	:param unicode ewt_dir: location of English Web Treebank dataset dir
	:param int max_files: max number of files to load (None for all files). this is useful for testing purposes.
	:param dict dict_openie_config: config object returned from soton_corenlppy.re.openie_lib.get_openie_config()

	:return: dict of Propbank file sent SRL annotations = { EWT_filename : { sent_index : ( [ word_token1, ... ], [ pos_token1, ... ], [ [ iob_token1, ... ], ... x N_clauses_in_sent ] ) } }
	:rtype: dict
	"""

	if not isinstance( propbank_dir, str ) :
		raise Exception( 'invalid propbank_dir' )
	if not isinstance( ewt_dir, str ) :
		raise Exception( 'invalid ewt_dir' )
	if not isinstance( max_files, (int,type(None)) ) :
		raise Exception( 'invalid max_files' )
	if not isinstance( dict_openie_config, dict ) :
		raise Exception( 'invalid dict_openie_config' )

	dictFiles = {}

	#
	# read propbank data files (to get SRL and POS labels for words)
	#
	# e.g. \propbank-release-master\data\google\ewt\answers\00\20070404104007AAY1Chs_ans.xml.gold_skel

	listFilesToProcess = []

	if os.path.exists( propbank_dir ) == False :
		raise Exception( 'propbank dir does not exist : ' + repr(propbank_dir) )

	strEnglishWebTreebankDir = os.path.abspath( propbank_dir ) + os.sep + 'data' + os.sep + 'google' + os.sep + 'ewt'

	listFiles1 = os.listdir( strEnglishWebTreebankDir )
	for strFile1 in listFiles1 :
		if os.path.isdir( strEnglishWebTreebankDir + os.sep + strFile1 ) :
			listFiles2 = os.listdir( strEnglishWebTreebankDir + os.sep + strFile1 )
			for strFile2 in listFiles2 :
				if os.path.isdir( strEnglishWebTreebankDir + os.sep + strFile1 + os.sep + strFile2 ) :
					listFiles3 = os.listdir( strEnglishWebTreebankDir + os.sep + strFile1 + os.sep + strFile2 )
					for strFile3 in listFiles3 :
						if strFile3.endswith( '.gold_skel' ) == True :
							listFilesToProcess.append( ( strEnglishWebTreebankDir + os.sep + strFile1 + os.sep + strFile2 + os.sep + strFile3, strFile1, strFile3 ) )

	# set a limit on number of files to read (useful for testing prior to a full run)
	if max_files == None :
		nMaxFiles = 1000000
	else :
		nMaxFiles = max_files

	nCountFiles = 0
	while (nCountFiles < nMaxFiles) and (nCountFiles < len(listFilesToProcess)) :
		(strFileWithPath, strDataset, strFile) = listFilesToProcess[nCountFiles]

		readHandle = codecs.open( strFileWithPath, 'r', 'utf-8', errors = 'replace' )
		listLines = readHandle.readlines()
		readHandle.close()

		#
		# Propbank example
		#
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 0 [WORD] WRB (TOP(S(SBARQ(WHADVP*) - - (ARGM-LOC*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 1 [WORD] MD (SQ* - - (ARGM-MOD*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 2 [WORD] PRP (NP*) - - (ARG0*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 3 [WORD] VB (VP* get get.01 (V*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 4 [WORD] NNS (NP*) - - (ARG1*) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 5 [WORD] IN (PP* - - (ARGM-LOC* * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 6 [WORD] NNP (NP* - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 7 [WORD] NNP *))))) - - *) * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 8 [WORD] , * - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 9 [WORD] PRP (S(S(NP*) - - * (ARG0*) *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 10 [WORD] MD (VP* - - * (ARGM-MOD*) *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 11 [WORD] VB (VP* liken like.01 * (V*) *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 12 [WORD] DT (NP* - - * (ARG1* *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 13 [WORD] JJ * - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 14 [WORD] NN *)))) - - * *) *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 15 [WORD] , * - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 16 [WORD] CC * - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 17 [WORD] PRP (S(NP*) - - * * (ARG0*)
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 18 [WORD] MD (VP* - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 19 [WORD] TO (S(VP* - - * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 20 [WORD] VB (VP* try try.01 * * (V*)
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 21 [WORD] NNS (NP*) - - * * (ARG1*)
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 22 [WORD] UH (INTJ*))))))) - - * * (ARGM-DIS*)
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 0 23 [WORD] . *)) - - * * *
		#
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 1 0 [WORD] PRP (TOP(S(S(NP*) - - (ARG0*) * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 1 1 [WORD] VBD (VP* search search.01 (V*) * * *
		#google/ewt/answers/00/20070404104007AAY1Chs_ans.xml 1 2 [WORD] RB (PP* - - (ARG1* * * *

		dictSents = {}
		listWords = []
		listPOS = []
		listSRL = []
		listPredicateWordSense = []
		nSRLCount = 0
		nSentIndex = None

		for strLine in listLines :
			if len( strLine.strip() ) > 0 :
				# split by space and remove any empty strings (as there will be multiple spaces between values)
				listComponents = strLine.strip().split(' ')
				while listComponents.count('') :
					listComponents.remove('')

				if len(listComponents) < 7 :
					raise Exception('error parsing file (number of columns) ' + repr(strFileWithPath) + ' : ' + repr(strLine) )

				# sent index
				if nSentIndex != None :
					if int( listComponents[1] ) != nSentIndex :
						raise Exception('error parsing file (sent index) ' + repr(strFileWithPath) + ' : ' + repr(strLine) )
				else :
					nSentIndex = int( listComponents[1] )

				# word will always be [WORD] in propbank as we need to cross-ref it with EWT tokenized sentences
				listWords.append( listComponents[3] )
				listPOS.append( listComponents[4] )
				listSRL.append( listComponents[8:] )
				listPredicateWordSense.append( listComponents[7] )

				if nSRLCount == 0 :
					nSRLCount = len( listComponents[8:] )
				elif len( listComponents[8:] ) != nSRLCount :
					raise Exception('SRL count mismatch between words : ' + repr(strFileWithPath) + ' : ' + repr(strLine) )

			else :
				# newline is a sentence delimiter
				if len(listWords) > 0 :

					listSRLSets = []

					# no SRL? add sentence with all labels as 'O'
					if nSRLCount == 0 :
						listSRLSets.append( ['O'] * len(listWords) )
					else :
						# for each SRL entry add a new fully annotated sentence (so we will get several copies of sent with different SRL if it has multiple relations)
						# an SRL entry always terminates with a ')' so no need to look after open fragments

						# debug - force 1 SRL entry per sent for testing purposes
						# nSRLCount = 1
						# end debug

						for nIndexSRL in range(nSRLCount) :
							listSRLInstance = []
							strOpenRole = None
							strOpenPred = None

							for nTokenIndex in range(len(listSRL)) :
								entry = listSRL[nTokenIndex]
								strSRL = entry[nIndexSRL]

								if strSRL.startswith('(') :
									strOpenRole = strSRL.strip('()*')
									strIOB = 'B-' + strOpenRole
								elif strOpenRole != None :
									strIOB = 'I-' + strOpenRole
								else :
									strIOB = 'O'

								# append the predicate word sense to the verb IOB tag so it can be used to train
								# in SRL tasks the predicate location is provided but not the predicate wordsense
								if strSRL.startswith('(V*') :
									strOpenPred = listPredicateWordSense[nTokenIndex]
								if strOpenPred != None :
									strIOB = strIOB + '-' + strOpenPred

								if strSRL.endswith(')') :
									strOpenRole = None
									strOpenPred = None

								listSRLInstance.append( strIOB )

							listSRLSets.append( listSRLInstance )

							if strOpenRole != None :
								raise Exception('SRL parse error : ' + repr(strFileWithPath) + ' : ' + repr(strLine) )

					# { sent_index : ( list_words, list_pos, ( list_IOB, list_IOB, ... ) ) }
					dictSents[nSentIndex] = ( listWords, listPOS, listSRLSets )

				# reset sent and start again
				listWords = []
				listPOS = []
				listPredicateWordSense = []
				listSRL = []
				nSRLCount = 0
				nSentIndex = None

		# add propbank annotated sent data for this file
		# { ewt_source_filename : { sent_index : ( list_words, list_pos, ( list_IOB, list_IOB, ... ) ) } }
		strSourceFile = strFile[:-1*len('.xml.gold_skel')] + '.txt'
		strEWTSourceFile = ewt_dir + os.sep + 'data' + os.sep + strDataset + os.sep + 'source' + os.sep + 'source_text_ascii_tokenized' + os.sep + strSourceFile
		dictFiles[strEWTSourceFile] = dictSents

		# update file count
		nCountFiles = nCountFiles + 1

	#
	# read english web treebank data files (to get original words to replace placeholders in propbank)
	#
	# e.g. \eng_web_tbk\data\answers\source\source_text_ascii_tokenized\20070404104007AAY1Chs_ans.txt

	for strEWTSourceFile in dictFiles :

		# load source file
		readHandle = codecs.open( strEWTSourceFile, 'r', 'utf-8', errors = 'replace' )
		listLines = readHandle.readlines()
		readHandle.close()

		#
		# EWT example
		#
		#<en=1>where can I get morcillas in tampa bay , I will like the argentinian type , but I will to try anothers please ?
		#<en=2>I searched all over the internet , but I could not find one place in Tampa Bay that sells morcillas , also known as blood pudding , black pudding and blood sausages .
		#<en=3>I learned that morcillas are basically impossible to find all across the North American region .
		#<en=4>But I did find this website , www.igourmet.com , where they sell all types of sausages , including blood sausages !
		#<en=5>So follow the link at the bottom and buy some blood sausages .
		#<en=6>huh ?
		#<en=7>yuck !!
		#<en=8>I do n't know , and it is because I do n't like them , do you know that , morcillas is coagulated blood from animals , ewww

		nSentIndex = 0
		for strLine in listLines :
			if len( strLine.strip() ) > 0 :
				listComponents = strLine.strip().split(' ')
				if len(listComponents) < 1 :
					raise Exception('error parsing file (not enough tokens) ' + repr(strEWTSourceFile) + ' : ' + repr(strLine) )

				# remove sent index from first token
				if listComponents[0].count('>') > 0 :
					listComponents[0] = listComponents[0][ listComponents[0].index('>') + 1 : ]
				else :
					raise Exception('error parsing file (missing sent marker on first token) ' + repr(strEWTSourceFile) + ' : ' + repr(strLine) )

				# replace [WORD] propbank placeholders with the actual word from EWT source file
				( listWords, listPOS, listSRLSets ) = dictFiles[strEWTSourceFile][nSentIndex]
				if len(listWords) != len(listComponents) :
					raise Exception('error source file has different number of tokens to propbank file ' + repr(strEWTSourceFile) + ' : ' + repr(strLine) )

				for nIndexWord in range(len(listWords)) :
					listWords[nIndexWord] = listComponents[nIndexWord]

				# next sent
				nSentIndex = nSentIndex + 1

	# all done
	return dictFiles
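# usage sketch (illustrative only - the dataset paths are placeholders and the get_openie_config() arguments
# are omitted; neither is defined by this module) :
#
#	dict_config = soton_corenlppy.re.openie_lib.get_openie_config( ... )
#	dict_propbank = read_propbank(
#		propbank_dir = '/datasets/propbank-release-master',
#		ewt_dir = '/datasets/eng_web_tbk',
#		max_files = 20,
#		dict_openie_config = dict_config )
#	for strEWTFile in dict_propbank :
#		for nSentIndex in dict_propbank[strEWTFile] :
#			( listWords, listPOS, listSRLSets ) = dict_propbank[strEWTFile][nSentIndex]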
#
# BERT support functions
#
def generate_vocab( list_word_sets = None, list_tag_sets = None, dict_openie_config = None ) :
	"""
	generate word, tag and predicate vocabularies (and lookup indexes) from a set of sentences in BERT format.
	e.g. sent = [CLS] ... [SEP] ... [SEP] [PAD] [PAD] [PAD] ...

	:param list list_word_sets: list of words for each sent
	:param list list_tag_sets: list of tags for each sent
	:param dict dict_openie_config: config object returned from soton_corenlppy.re.openie_lib.get_openie_config()

	:return: tuple = ( list_words_vocab, list_tags_vocab, list_predicates_vocab, dict_index_words, dict_index_tags, dict_index_predicates, index_pad_word, index_pad_tag, index_pad_predicate )
	:rtype: tuple
	"""

	if not isinstance( list_word_sets, list ) :
		raise Exception( 'invalid list_word_sets' )
	if not isinstance( list_tag_sets, list ) :
		raise Exception( 'invalid list_tag_sets' )
	if not isinstance( dict_openie_config, dict ) :
		raise Exception( 'invalid dict_openie_config' )

	if (list_tag_sets != None) and (len(list_tag_sets) != len(list_word_sets)) :
		raise Exception( 'mismatch num sets - word, tag' )

	setWords = set([])
	setTags = set([])
	setPredicates = set([])

	for nIndex in range(len(list_word_sets)) :
		listTaggedSent = []
		listTokens = list_word_sets[nIndex].split(' ')

		if list_tag_sets != None :
			listTags = list_tag_sets[nIndex].split(' ')
		else :
			# if no tags are provided assign 'O' to all tokens
			listTags = ['O'] * len(listTokens)
			listTags[0] = '[CLS]'
			listTags[-1] = '[SEP]'

		for nIndexWord in range(len(listTokens)) :
			setWords.add( listTokens[nIndexWord] )
			setTags.add( listTags[nIndexWord] )

			if listTags[nIndexWord].startswith('B-V-') == True :
				# extract wordsense of predicate
				strPredicate = listTags[nIndexWord][ len('B-V-') : ]
			elif listTags[nIndexWord].startswith('I-V-') == True :
				strPredicate = 'X'
			else :
				strPredicate = 'O'

			setPredicates.add( strPredicate )

	# make set into an index so we can lookup phrases later by index id
	listWords = list(setWords)
	listTags = list(setTags)
	listPredicates = list(setPredicates)

	# add token for unknown vocabulary (for test corpus which might have vocabulary not seen in training corpus) and padding
	listWords.append( '???' )
	if not '[PAD]' in listWords :
		listWords.append( '[PAD]' )
	if not '[PAD]' in listTags :
		listTags.append( '[PAD]' )
	if not '[PAD]' in listPredicates :
		listPredicates.append( '[PAD]' )

	# make an inverted index of words and tags
	dictIndexWord = {}
	for nIndexEntry in range(len(listWords)) :
		dictIndexWord[listWords[nIndexEntry]] = nIndexEntry

	# padding token for words is '[PAD]'
	nPaddingIndexWord = dictIndexWord['[PAD]']

	dictIndexTag = {}
	for nIndexEntry in range(len(listTags)) :
		dictIndexTag[listTags[nIndexEntry]] = nIndexEntry

	# padding token for tags is '[PAD]'
	nPaddingIndexTag = dictIndexTag['[PAD]']

	dictIndexPredicate = {}
	for nIndexEntry in range(len(listPredicates)) :
		dictIndexPredicate[listPredicates[nIndexEntry]] = nIndexEntry

	# padding token for predicates is '[PAD]'
	nPaddingIndexPredicate = dictIndexPredicate['[PAD]']

	print(( 'padding word index = ', repr(nPaddingIndexWord) ))
	print(( 'padding tag index = ', repr(nPaddingIndexTag) ))
	print(( 'word list size = ', repr(len(listWords)) ))
	print(( 'tag list size = ', repr(len(listTags)) ))
	print(( 'predicate list size = ', repr(len(listPredicates)) ))
	print(( 'word index size = ', repr(len(dictIndexWord)) ))
	print(( 'tag index size = ', repr(len(dictIndexTag)) ))
	print(( 'predicate index size = ', repr(len(dictIndexPredicate)) ))

	return ( listWords, listTags, listPredicates, dictIndexWord, dictIndexTag, dictIndexPredicate, nPaddingIndexWord, nPaddingIndexTag, nPaddingIndexPredicate )
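# usage sketch (illustrative only - the sentence and tag strings below are made-up examples of the BERT
# style input this function expects; dict_config is the same config object used elsewhere in this module) :
#
#	listSents = [ '[CLS] the dog barked [SEP] barked [SEP]' ]
#	listTags = [ '[CLS] B-ARG0 I-ARG0 B-V-bark.01 [SEP] bark.01 [SEP]' ]
#	( listWordsVocab, listTagsVocab, listPredVocab,
#	  dictIndexWord, dictIndexTag, dictIndexPred,
#	  nPadWord, nPadTag, nPadPred ) = generate_vocab(
#		list_word_sets = listSents, list_tag_sets = listTags, dict_openie_config = dict_config )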
def generate_sequence( index_words = None, index_tags = None, index_predicates = None, padding_word_value = None, padding_tag_value = None, padding_predicate_value = None, sequence_length = None, list_word_sets = None, list_tag_sets = None, dict_openie_config = None ) :
	"""
	make a set of (word, tag, predicate) sequences for each sentence

	:param dict index_words: index of words from generate_vocab()
	:param dict index_tags: index of tags from generate_vocab()
	:param dict index_predicates: index of predicates from generate_vocab()
	:param int padding_word_value: index value of the pad word
	:param int padding_tag_value: index value of the pad tag
	:param int padding_predicate_value: index value of the pad predicate
	:param int sequence_length: max length of sequence (e.g. sentence length) - can be None for no limit. this is needed to limit sents to a fixed size for use in embeddings with BERT
	:param list list_word_sets: list of sentence word strings (BERT format)
	:param list list_tag_sets: list of sentence tag strings (BERT format)
	:param dict dict_openie_config: config object returned from soton_corenlppy.re.openie_lib.get_openie_config()

	:return: tuple = ( list_seq_words, list_seq_tags_categorical, list_seq_predicates_categorical, max_seq_length )
	:rtype: tuple
	"""

	if not isinstance( index_words, dict ) :
		raise Exception( 'invalid index_words' )
	if not isinstance( index_tags, dict ) :
		raise Exception( 'invalid index_tags' )
	if not isinstance( index_predicates, dict ) :
		raise Exception( 'invalid index_predicates' )
	if not isinstance( padding_word_value, int ) :
		raise Exception( 'invalid padding_word_value' )
	if not isinstance( padding_tag_value, int ) :
		raise Exception( 'invalid padding_tag_value' )
	if not isinstance( padding_predicate_value, int ) :
		raise Exception( 'invalid padding_predicate_value' )
	if not isinstance( sequence_length, (int,type(None)) ) :
		raise Exception( 'invalid sequence_length' )
	if not isinstance( list_word_sets, list ) :
		raise Exception( 'invalid list_word_sets' )
	if not isinstance( list_tag_sets, list ) :
		raise Exception( 'invalid list_tag_sets' )
	if not isinstance( dict_openie_config, dict ) :
		raise Exception( 'invalid dict_openie_config' )

	if (list_tag_sets != None) and (len(list_tag_sets) != len(list_word_sets)) :
		raise Exception( 'mismatch num sets - word, tag' )
	if (sequence_length != None) and (sequence_length < 2) :
		raise Exception( 'sequence_length too small' )

	nMaxLen = 0
	listTaggedSentSet = []
	for nIndex in range(len(list_word_sets)) :
		listTaggedSent = []
		listTokens = list_word_sets[nIndex].split(' ')
		#print( 'sent = ', repr(listTokens) )

		if sequence_length != None :
			# truncate sentence if its too long
			if len(listTokens) > sequence_length :
				listTokens = listTokens[:sequence_length]

			# force padded sequence length to be of this predefined length
			nMaxLen = sequence_length
		else :
			# get the longest sentence and use this as padded sequence length
			nMaxLen = max( nMaxLen, len(listTokens) )

		if list_tag_sets != None :
			listTags = list_tag_sets[nIndex].split(' ')
		else :
			listTags = None
		#print( 'tags = ', repr(listTags) )

		for nIndexWord in range(len(listTokens)) :
			# token not in the vocab list? replace with '???' if so
			strToken = listTokens[nIndexWord]
			if not listTokens[nIndexWord] in index_words :
				strToken = '???'

			# do we have a tag? if not its 'O' in IOB tagging scheme
			if listTags != None :
				strTag = listTags[nIndexWord]
			else :
				if nIndexWord == 0 :
					strTag = '[CLS]'
				elif nIndexWord == len(listTokens)-1 :
					strTag = '[SEP]'
				else :
					strTag = 'O'

			if strTag.startswith('B-V-') == True :
				strPredicate = strTag[ len('B-V-'): ]
				if not strPredicate in index_predicates :
					strPredicate = '???'
			elif strTag.startswith('I-V-') == True :
				strPredicate = 'X'
			else :
				strPredicate = 'O'

			listTaggedSent.append( ( strToken, strTag, strPredicate ) )

		listTaggedSentSet.append( listTaggedSent )
		#print( 'tagged sent = ', repr(listTaggedSent) )

	print(( 'max token length = ', repr(nMaxLen) ))

	# generate sequences of words and tags for training
	# - sequence = [CLS] some words give me trouble [SEP] give [SEP]
	# note: sequences are padded to max_length of longest word sentence (including the predicates) so they have the same size allowing embedding token index values to be added later
	listSequenceSetWords = []
	listSequenceSetPredicates = []
	listSequenceSetTags = []
	for listTaggedSent in listTaggedSentSet :
		listSequenceWords = []
		listSequenceTags = []
		listSequencePredicates = []

		for (strWord,strTag,strPredicate) in listTaggedSent :
			listSequenceWords.append( index_words[strWord] )
			listSequenceTags.append( index_tags[strTag] )
			listSequencePredicates.append( index_predicates[strPredicate] )

		#if len(listSequenceWords) != sequence_length :
		#	raise Exception('word length incorrect : ' + repr(len(listSequenceWords)) )

		listSequenceSetWords.append( listSequenceWords )
		listSequenceSetPredicates.append( listSequencePredicates )
		listSequenceSetTags.append( listSequenceTags )

	# pad sequences so they all have the same length
	# use int16 to save memory. we need to represent word and tag indexes in the sequence; a conservative limit of 16,384 unique entries keeps indexes safely inside the signed int16 range (propbank has about 20,000 unique words, bert 30,000)
	# note: keras is assumed to be available here (see the guarded import note at the top of this module)
	if len(index_words) > 16*1024 :
		raise Exception('int16 too small for number of words')
	listSequenceSetWords = keras.preprocessing.sequence.pad_sequences( maxlen = nMaxLen, sequences = listSequenceSetWords, padding = 'post', value = padding_word_value, dtype='int16' )

	if len(index_predicates) > 16*1024 :
		raise Exception('int16 too small for number of predicates')
	listSequenceSetPredicates = keras.preprocessing.sequence.pad_sequences( maxlen = nMaxLen, sequences = listSequenceSetPredicates, padding = 'post', value = padding_predicate_value, dtype='int16' )

	if len(index_tags) > 16*1024 :
		raise Exception('int16 too small for number of tags')
	listSequenceSetTags = keras.preprocessing.sequence.pad_sequences( maxlen = nMaxLen, sequences = listSequenceSetTags, padding = 'post', value = padding_tag_value, dtype='int16' )

	# convert tag values to keras 'categorical' type as we have a multi-class problem (each word can have a tag from a set of mutually exclusive labels)
	# later this means we will use the loss function "categorical_crossentropy".
	# use int16 to save memory as this will be converted into large array = Matrix[Nsent_length x Nsent_length] x Nsent
	listSequenceSetTagsCategorical = []
	for nIndexSequence in range(len(listSequenceSetTags)) :
		matrixObj = keras.utils.to_categorical( y = listSequenceSetTags[nIndexSequence], num_classes = len(index_tags), dtype='int16' )
		listSequenceSetTagsCategorical.append( matrixObj )

	# convert predicate values to keras 'categorical' type as we have a multi-class problem (each word can have a tag from a set of mutually exclusive labels)
	listSequenceSetPredicatesCategorical = []
	for nIndexSequence in range(len(listSequenceSetPredicates)) :
		matrixObj = keras.utils.to_categorical( y = listSequenceSetPredicates[nIndexSequence], num_classes = len(index_predicates), dtype='int16' )
		listSequenceSetPredicatesCategorical.append( matrixObj )

	# return the values
	return ( listSequenceSetWords, listSequenceSetTagsCategorical, listSequenceSetPredicatesCategorical, nMaxLen )
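# usage sketch (illustrative only - continues the generate_vocab() sketch above; keras must be installed
# for the padding and one-hot conversion to work) :
#
#	( listSeqWords, listSeqTagsCat, listSeqPredCat, nMaxLen ) = generate_sequence(
#		index_words = dictIndexWord, index_tags = dictIndexTag, index_predicates = dictIndexPred,
#		padding_word_value = nPadWord, padding_tag_value = nPadTag, padding_predicate_value = nPadPred,
#		sequence_length = 128,
#		list_word_sets = listSents, list_tag_sets = listTags,
#		dict_openie_config = dict_config )
#	# listSeqWords feeds an embedding layer; listSeqTagsCat is the one-hot (categorical_crossentropy) target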
def create_corpus( dict_propbank = None, pad_to_size = None, test_fraction = 0.1, dict_openie_config = None ) :
	"""
	create a BERT style training corpus from propbank data. e.g. sent = [CLS] ... [SEP] ... [SEP] [PAD] [PAD] [PAD] ...

	:param dict dict_propbank: propbank data from read_propbank()
	:param int pad_to_size: size to pad sequences to (can be None)
	:param float test_fraction: fraction of corpus to use as test data
	:param dict dict_openie_config: config object returned from soton_corenlppy.re.openie_lib.get_openie_config()

	:return: tuple = ( list_train_corpus_words, list_train_corpus_tags, list_test_corpus_words, list_test_corpus_tags )
	:rtype: tuple
	"""

	if not isinstance( dict_propbank, dict ) :
		raise Exception( 'invalid dict_propbank' )
	if not isinstance( pad_to_size, (int,type(None)) ) :
		raise Exception( 'invalid pad_to_size' )
	if not isinstance( test_fraction, (float,type(None)) ) :
		raise Exception( 'invalid test_fraction' )
	if not isinstance( dict_openie_config, dict ) :
		raise Exception( 'invalid dict_openie_config' )

	listTrainingCorpusWords = []
	listTrainingCorpusTags = []

	# note: the original listing looped over an undefined name (dictPropbankFiles); dict_propbank is used here
	for strFile in dict_propbank :
		dictSents = dict_propbank[strFile]

		for nSentIndex in dictSents :
			( listWords, listPOS, listSRLSets ) = dictSents[nSentIndex]

			# init tokenized structures
			listWordsTokenized = []
			listPOSTokenized = []
			listSRLSetsTokenized = []
			for listSRL in listSRLSets :
				listSRLSetsTokenized.append( [] )

			# apply WordPiece tokenization to each word and insert N pieces using the original word/pos/srl tags
			# note: wordpiece_tokenizer is expected to be a module-level BERT WordPiece tokenizer (e.g. built from file_bert_vocab); it is not defined in this listing
			for nIndexWord in range(len(listWords)) :
				strWord = listWords[nIndexWord]
				strPOS = listPOS[nIndexWord]

				# get word pieces for this word
				listWordPieces = wordpiece_tokenizer.tokenize( strWord )

				# insert pieces (words and pos)
				for strPiece in listWordPieces :
					listWordsTokenized.append( strPiece )
					listPOSTokenized.append( strPOS )

				# insert pieces (srl)
				for nIndexSRLSet in range(len(listSRLSets)) :
					listSRL = listSRLSets[nIndexSRLSet]
					if len(listSRL) != len(listWords) :
						raise Exception( 'Word len != SRL len' )

					bBegin = False
					for strPiece in listWordPieces :
						# ensure only the first piece has the begin SRL label
						if (listSRL[nIndexWord].startswith('B-') == True) and (bBegin == True) :
							strSRL = 'I-' + listSRL[nIndexWord][2:]
						elif (listSRL[nIndexWord].startswith('B-') == True) and (bBegin == False) :
							strSRL = listSRL[nIndexWord]
							bBegin = True
						else :
							strSRL = listSRL[nIndexWord]

						listSRLSetsTokenized[nIndexSRLSet].append( strSRL )

			# create the training corpus words and tags
			for listSRL in listSRLSetsTokenized :

				# get the first predicate token (if any) to be added at the end of the training sent
				strPredicatePhrase = '[NONE]'
				strPredicateTagPhrase = '[NONE]'
				for nIndexToken in range(len(listSRL)) :
					strTag = listSRL[nIndexToken]
					if strTag.startswith( 'B-V-' ) == True :
						strPredicatePhrase = listWordsTokenized[nIndexToken]
						strPredicateTagPhrase = strTag[ len('B-V-') : ]

				'''
				listPredicateWords = []
				listPredicateTags = []
				for nIndexToken in range(len(listSRL)) :
					strTag = listSRL[nIndexToken]
					if (strTag.startswith( 'B-V' ) == True) or (strTag.startswith( 'I-V' ) == True) :
						strWord = listWordsTokenized[nIndexToken]
						listPredicateWords.append( strWord )
						listPredicateTags.append( strTag )

				if len(listPredicateWords) > 0 :
					strPredicatePhrase = ' '.join( listPredicateWords ) + ' '
					strPredicateTagPhrase = ' '.join( listPredicateTags ) + ' '
				else :
					strPredicatePhrase = '--- '
					strPredicateTagPhrase = '--- '
				'''

				# pad sentence (leaving 3 for [SEP] pred [SEP] at end)
				if (pad_to_size != None) and (len(listWordsTokenized) < pad_to_size) :
					for nPadIndex in range( pad_to_size - len(listWordsTokenized) - 3 ) :
						listWordsTokenized.append( '[PAD]' )
						listSRL.append( '[PAD]' )

				# add a copy of the sent for each different SRL annotation
				# this means the same sent will appear with its full variety of SRL annotations
				listTrainingCorpusWords.append( '[CLS] ' + ' '.join( listWordsTokenized ) + ' [SEP] ' + strPredicatePhrase + ' [SEP]' )
				listTrainingCorpusTags.append( '[CLS] ' + ' '.join( listSRL ) + ' [SEP] ' + strPredicateTagPhrase + ' [SEP]' )

	# for testing make the test corpus a fraction of the training set (should get really good SRL classification rates)
	if test_fraction == None :
		nNumTestFiles = 0
	else :
		nNumTestFiles = int( test_fraction * len(listTrainingCorpusWords) )

	listTestCorpusWords = listTrainingCorpusWords[:nNumTestFiles]
	listTestCorpusTags = listTrainingCorpusTags[:nNumTestFiles]
	listTrainingCorpusWords = listTrainingCorpusWords[nNumTestFiles:]
	listTrainingCorpusTags = listTrainingCorpusTags[nNumTestFiles:]

	# all done
	return ( listTrainingCorpusWords, listTrainingCorpusTags, listTestCorpusWords, listTestCorpusTags )
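# usage sketch (illustrative only - assumes dict_propbank from read_propbank() and a module-level BERT
# wordpiece_tokenizer, neither of which is set up in this listing) :
#
#	( listTrainWords, listTrainTags, listTestWords, listTestTags ) = create_corpus(
#		dict_propbank = dict_propbank, pad_to_size = 128, test_fraction = 0.1, dict_openie_config = dict_config )
#	# the corpus strings can then be passed to generate_vocab() and generate_sequence()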
#
# streusle support functions
#
def read_streusle( streusle_home = None, allowed_id_set = None, dict_config = None ) :
	"""
	read in the streusle dataset. for information about the corpus see https://github.com/nert-nlp/streusle/blob/master/CONLLULEX.md

	:param unicode streusle_home: dir of streusle dataset
	:param list allowed_id_set: list of allowed IDs (None for no filter)
	:param dict dict_config: config object

	:return: dict = { sent_index : { 'sent_id' : <str>, 'text' : <str>, 'tokens' : <dict>, 'phrases' : <dict>, 'phrases_addr' : <dict> } }. 'tokens' is indexed by token position, each entry a list of the remaining CoNLL-U-Lex columns with multi-word expression addresses converted from str to tuple(mwe_id, rel_position_in_mwe). 'phrases' is a dict with key verb|prep|noun and value of phrases. 'phrases_addr' is the same but with a list of token addresses for each phrase not a string.
	:rtype: dict
	"""

	if not isinstance( streusle_home, str ) :
		raise Exception( 'invalid streusle_home' )
	if not isinstance( allowed_id_set, (list,type(None)) ) :
		raise Exception( 'invalid allowed_id_set' )
	if not isinstance( dict_config, dict ) :
		raise Exception( 'invalid dict_config' )

	if not 'verb_phrase_lexcat' in dict_config :
		raise Exception( 'verb_phrase_lexcat missing from config' )
	if not 'prep_phrase_lexcat' in dict_config :
		raise Exception( 'prep_phrase_lexcat missing from config' )
	if not 'noun_phrase_lexcat' in dict_config :
		raise Exception( 'noun_phrase_lexcat missing from config' )

	dictClassPatterns = { 'verb' : dict_config['verb_phrase_lexcat'], 'prep' : dict_config['prep_phrase_lexcat'], 'noun' : dict_config['noun_phrase_lexcat'] }

	readHandle = None
	writeHandle = None

	#
	# load streusle data file 'streusle.conllulex' into memory as a dict indexed by streusle_sent_id
	#

	try :
		if os.path.exists( streusle_home ) == False :
			raise Exception( 'streusle home dir missing : ' + repr(streusle_home) )
		strInputFile = os.path.abspath( streusle_home ) + os.sep + 'streusle.conllulex'

		readHandle = codecs.open( strInputFile, 'r', 'utf-8', errors = 'replace' )
		listLines = readHandle.readlines()
		readHandle.close()

		# # newdoc id = reviews-001325
		# # sent_id = reviews-001325-0001
		# # text = Highly recommended
		# # streusle_sent_id = ewtb.r.001325.1
		# # mwe = Highly recommended
		# 1 Highly highly ADV RB _ 2 advmod 2:advmod _ _ ADV highly _ _ _ _ _ O-ADV
		# 2 recommended recommend VERB VBN Tense=Past|VerbForm=Part 0 root 0:root _ _ V recommend v.communication _ _ _ _ O-V-v.communication

		dictSentSet = {}
		listTokens = []
		strSentID = None
		dictSentTokens = {}
		nSentIndex = 0
		nTokenIndex = 0

		for strLine in listLines :
			strLineClean = strLine.strip()

			if strLineClean.startswith('# streusle_sent_id =') :
				strSentID = strLineClean[ len('# streusle_sent_id =') : ]

			elif len(strLineClean) == 0 :
				# newline indicates the end of a sentence so add a new sent entry
				# note: make text by using tokens NOT the original untokenized text to preserve the token index for stuff like 'spot-on' == 'spot' '-' 'on'
				if (allowed_id_set == None) or (nSentIndex in allowed_id_set) :
					dictSentSet[ nSentIndex ] = { 'sent_id' : strSentID, 'text' : ' '.join( listTokens ), 'tokens' : dictSentTokens }

				# reset sent data
				listTokens = []
				strSentID = None
				dictSentTokens = {}
				nTokenIndex = 0
				nSentIndex = nSentIndex + 1

			elif not strLineClean.startswith('#') :
				listParts = strLineClean.split('\t')
				if len(listParts) != 19 :
					raise Exception( 'token entry does not have 19 columns : ' + repr(strLineClean) )

				# strong multi-word expression (SMWE) = [ mwe_id, relative_token_position_in_group ] e.g. [1,1] = mwe #1, 1st token
				if not listParts[10] == '_' :
					listParts2 = listParts[10].split(':')
					tupleMWE = ( int(listParts2[0]), int(listParts2[1]) )
				else :
					tupleMWE = None
				listParts[10] = tupleMWE

				# weak multi-word expression (WMWE) = [ mwe_id, relative_token_position_in_group ] e.g. [1,1] = mwe #1, 1st token
				if not listParts[15] == '_' :
					listParts2 = listParts[15].split(':')
					tupleMWE = ( int(listParts2[0]), int(listParts2[1]) )
				else :
					tupleMWE = None
				listParts[15] = tupleMWE

				# add token details to the dict. tokens appear sequentially so use the sequence index NOT the 1st column index as sometimes there is a value of 6, 7, *7.1*, 8 ...
				dictSentTokens[ nTokenIndex ] = listParts[1:]
				listTokens.append( listParts[1] )
				nTokenIndex = nTokenIndex + 1

		if strSentID != None :
			# add last sent entry if we have one left running
			if (allowed_id_set == None) or (nSentIndex in allowed_id_set) :
				dictSentSet[ nSentIndex ] = { 'sent_id' : strSentID, 'text' : ' '.join( listTokens ), 'tokens' : dictSentTokens }

	finally :
		if readHandle != None :
			readHandle.close()
			readHandle = None

	dict_config['logger'].info('loaded ' + str(len(dictSentSet)) + ' streusle sents')

	#
	# construct verb, prep and noun phrase sets, using the MWE (multi-word expression) annotations
	#

	for nSentID in dictSentSet :
		dictPhrase = {}
		nGroupIndexSingle = 1000

		# first pass get all single tokens and MWE that match a LEXCAT of interest
		for nTokenID in dictSentSet[nSentID]['tokens'] :
			# remember original index 0 was removed and became token_id
			strToken = dictSentSet[nSentID]['tokens'][nTokenID][0]
			strLexCat = dictSentSet[nSentID]['tokens'][nTokenID][10]

			# only bother with SMWE
			tupleMWE = dictSentSet[nSentID]['tokens'][nTokenID][9]

			for strType in dictClassPatterns :
				for strLexCatAllowed in dictClassPatterns[strType] :
					if strLexCatAllowed == strLexCat :

						if tupleMWE == None :
							# add single tokens in a unique new group ID
							dictPhrase[ nGroupIndexSingle ] = [ [ strToken ], [ nTokenID ], strType ]
							nGroupIndexSingle = nGroupIndexSingle + 1
						else :
							# add multi-word tokens in the group they are assigned to
							nGroup = tupleMWE[0]
							nTokenPosInGroup = tupleMWE[1]

							# 1 = first token in group
							if not nGroup in dictPhrase :
								dictPhrase[nGroup] = [ [], [], strType ]
							if len(dictPhrase[nGroup][0]) < nTokenPosInGroup :
								listExtend = [''] * ( nTokenPosInGroup - len(dictPhrase[nGroup][0]) )
								listExtendAddr = [-1] * ( nTokenPosInGroup - len(dictPhrase[nGroup][1]) )
								dictPhrase[nGroup][0].extend( listExtend )
								dictPhrase[nGroup][1].extend( listExtendAddr )

							dictPhrase[nGroup][0][nTokenPosInGroup-1] = strToken
							dictPhrase[nGroup][1][nTokenPosInGroup-1] = nTokenID

		# second pass fill in any group tokens that are missing (as they might have a LEXCAT of '_')
		for nTokenID in dictSentSet[nSentID]['tokens'] :
			# remember original index 0 was removed and became token_id
			strToken = dictSentSet[nSentID]['tokens'][nTokenID][0]

			# SMWE
			tupleMWE = dictSentSet[nSentID]['tokens'][nTokenID][9]

			if tupleMWE != None :
				nGroup = tupleMWE[0]
				nTokenPosInGroup = tupleMWE[1]

				# if group is in phrase list then it matched LEXCAT
				# so add this MWE token to fill in any gaps
				if nGroup in dictPhrase :
					if len(dictPhrase[nGroup][0]) < nTokenPosInGroup :
						listExtend = [''] * ( nTokenPosInGroup - len(dictPhrase[nGroup][0]) )
						listExtendAddr = [-1] * ( nTokenPosInGroup - len(dictPhrase[nGroup][1]) )
						dictPhrase[nGroup][0].extend( listExtend )
						dictPhrase[nGroup][1].extend( listExtendAddr )

					dictPhrase[nGroup][0][nTokenPosInGroup-1] = strToken
					dictPhrase[nGroup][1][nTokenPosInGroup-1] = nTokenID

		# convert the phrase dict so it has the type as the index
		dictTypeIndexedPhrases = {}
		dictTypeIndexedPhrasesAddr = {}
		for nGroup in dictPhrase :
			listPhrase = dictPhrase[nGroup][0]
			listPhraseAddr = dictPhrase[nGroup][1]
			strType = dictPhrase[nGroup][2]

			if not strType in dictTypeIndexedPhrases :
				dictTypeIndexedPhrases[strType] = []
				dictTypeIndexedPhrasesAddr[strType] = []

			dictTypeIndexedPhrases[strType].append( ' '.join( listPhrase ) )
			dictTypeIndexedPhrasesAddr[strType].append( listPhraseAddr )

		# add phrase dict to sent data
		dictSentSet[nSentID]['phrases'] = dictTypeIndexedPhrases
		dictSentSet[nSentID]['phrases_addr'] = dictTypeIndexedPhrasesAddr

	return dictSentSet
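# usage sketch (illustrative only - the lexcat lists shown are examples, not the definitive STREUSLE sets,
# and dict_config must also provide a 'logger') :
#
#	dict_config['verb_phrase_lexcat'] = [ 'V' ]
#	dict_config['prep_phrase_lexcat'] = [ 'P', 'PP' ]
#	dict_config['noun_phrase_lexcat'] = [ 'N', 'PRON' ]
#	dict_streusle = read_streusle( streusle_home = '/datasets/streusle', allowed_id_set = None, dict_config = dict_config )
#	print( dict_streusle[0]['phrases'] )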
def streusle_to_IOB( dict_sent_set = None, max_processes = 1, dict_config = None ) :
	"""
	compute a set of IOB tags for [noun, verb, prep] from a sent set returned by read_streusle()

	:param dict dict_sent_set: dict returned by read_streusle()
	:param int max_processes: max number of processes to use for POS tagging
	:param dict dict_config: config object

	:return: list of sents, each a list of IOB annotated token entries = [ [ ( token, pos, IOB ), ... ], ... ]
	:rtype: list
	"""

	if not isinstance( dict_sent_set, dict ) :
		raise Exception( 'invalid dict_sent_set' )
	if not isinstance( max_processes, int ) :
		raise Exception( 'invalid max_processes' )
	if not isinstance( dict_config, dict ) :
		raise Exception( 'invalid dict_config' )

	# IOB files have a line per token, and a newline to indicate a new sentence
	# streusle IOB annotations have classes for noun, verb and prep
	#
	# e.g.
	#
	# I PRP B-noun
	# love VRB B-verb
	# New NN B-noun
	# York NN I-noun
	# ! - O
	#
	# Its PRP B-noun
	# lovely VRB B-verb

	# POS tag all sentences (replacing the POS tags which are not stanford POS tags)
	dictSentTokenized = {}
	dictText = {}
	for nSentID in dict_sent_set :
		listTokens = []
		for nTokenID in dict_sent_set[nSentID]['tokens'] :
			strToken = dict_sent_set[nSentID]['tokens'][nTokenID][0]
			listTokens.append( strToken )

		# sent id's are strings
		strSentID = str(nSentID)
		dictSentTokenized[ strSentID ] = [ listTokens ]
		dictText[ strSentID ] = dict_sent_set[nSentID]['text']

	# run stanford coreNLP to do tokenize, pos
	# the tokenize is constrained to only split sents at EOL so the input sent count == output sent count
	# also only split using whitespace, so the original streusle tokenization is preserved (and hence the number of tokens remains the same)
	( dictStanfordTokens, dictStanfordPOS ) = soton_corenlppy.re.comp_sem_lib.exec_stanford_corenlp( dict_text = dictText, work_dir = os.path.abspath( '.' ), annotators = 'tokenize,ssplit,pos', option_list = ['-tokenizeOptions','americanize=false','-tokenize.whitespace','true','-ssplit.eolonly','true'], num_processes = max_processes, dict_openie_config = dict_config )

	'''
	# relic NLTK POS tagger use
	dictTaggedSents = soton_corenlppy.common_parse_lib.pos_tag_tokenset_batch( document_token_set = dictSentTokenized, lang = 'en', dict_common_config = dict_config, max_processes = max_processes, timeout = 300 )
	'''

	# compute IOB tags
	listType = [ 'noun','verb','prep' ]
	listSentsIOB = []
	strLastClass = None

	listSortedSentID = sorted( list( dict_sent_set.keys() ) )
	for nSentID in listSortedSentID :
		strSentID = str(nSentID)
		dictTypeIndexedPhrasesAddr = dict_sent_set[nSentID]['phrases_addr']

		# stanford POS sent id's are strings
		treeObjPOS = dictStanfordPOS[strSentID]

		if len(dict_sent_set[nSentID]['tokens']) != len(treeObjPOS) :
			raise Exception( 'stanford CoreNLP tokenization != streusle tokenization (review stanford options, this should not be possible with -tokenize.whitespace true)' )

		listIOB = []
		for nTokenID in dict_sent_set[nSentID]['tokens'] :
			strToken = dict_sent_set[nSentID]['tokens'][nTokenID][0]
			#strPOS = dictTaggedSents[nSentID][0][nTokenID][1]
			strPOS = treeObjPOS[nTokenID].label()

			# look to see if the token address appears in a labelled phrase. if so that's its class
			strClass = None
			for strType in listType :
				if strType in dictTypeIndexedPhrasesAddr :
					for listPhraseAddr in dictTypeIndexedPhrasesAddr[strType] :
						for nAddr in listPhraseAddr :
							if nAddr == nTokenID :
								strClass = strType
								break
						if strClass != None :
							break
				if strClass != None :
					break

			if strClass == None :
				strIOBTag = 'O'
			elif strClass == strLastClass :
				strIOBTag = 'I-' + strClass
			else :
				strIOBTag = 'B-' + strClass

			strLastClass = strClass

			listIOB.append( ( strToken, strPOS, strIOBTag ) )

		# new sent
		listSentsIOB.append( listIOB )

	# all done
	return listSentsIOB
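# usage sketch (illustrative only - continues the read_streusle() sketch above; Stanford CoreNLP must be
# available to soton_corenlppy.re.comp_sem_lib.exec_stanford_corenlp()) :
#
#	listSentsIOB = streusle_to_IOB( dict_sent_set = dict_streusle, max_processes = 4, dict_config = dict_config )
#	for ( strToken, strPOS, strIOB ) in listSentsIOB[0] :
#		print( strToken, strPOS, strIOB )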
def sentences_to_IOB( sentences_file = None, allowed_id_set = None, tokenize_sents = True, max_processes = 1, dict_config = None ) :
	"""
	read in a sentence file, then do POS tagging and generate an IOB file with every IOB tag set to a default of 'O'.
	if the sentence file has no tabs the sent index is assumed to be the row number. if the sentence file has tabs the 1st column is assumed to be the sent index and the 2nd column the text.

	:param unicode sentences_file: sentences filename to read
	:param list allowed_id_set: list of allowed IDs (None for no filter)
	:param bool tokenize_sents: if True use Treebank to tokenize, otherwise split sent using spaces
	:param int max_processes: max number of processes to use for POS tagging
	:param dict dict_config: config object

	:return: sent_ID's, sent_IOB = list of sent ID's; list of sents, each a list of IOB annotated token entries = [ [ ( token, pos, IOB ), ... ], ... ]
	:rtype: list, list
	"""

	if not isinstance( sentences_file, str ) :
		raise Exception( 'invalid sentences_file' )
	if not isinstance( allowed_id_set, (list,type(None)) ) :
		raise Exception( 'invalid allowed_id_set' )
	if not isinstance( tokenize_sents, bool ) :
		raise Exception( 'invalid tokenize_sents' )
	if not isinstance( max_processes, int ) :
		raise Exception( 'invalid max_processes' )
	if not isinstance( dict_config, dict ) :
		raise Exception( 'invalid dict_config' )

	# read in sentences text
	readHandle = None
	dictSentences = {}
	try :
		readHandle = codecs.open( sentences_file, 'r', 'utf-8', errors = 'replace' )
		listLines = readHandle.readlines()
		readHandle.close()

		nCount = 0
		for strLine in listLines :
			strLineClean = strLine.strip()
			if len(strLineClean) == 0 :
				continue

			listParts = strLineClean.split('\t')
			if len(listParts) > 1 :
				strSentID = str(listParts[0])
				if (allowed_id_set == None) or (strSentID in allowed_id_set) :
					dictSentences[strSentID] = listParts[1]
			else :
				strSentID = str(nCount)
				if (allowed_id_set == None) or (strSentID in allowed_id_set) :
					dictSentences[strSentID] = listParts[0]
				nCount = nCount + 1

	finally :
		if readHandle != None :
			readHandle.close()

	# tokenization and POS tagging
	dictSentTokenized = {}
	for strSentID in dictSentences :
		strUTF8Text = dictSentences[strSentID]

		if tokenize_sents == True :
			listTokens = soton_corenlppy.common_parse_lib.unigram_tokenize_text( text = strUTF8Text, dict_common_config = dict_config )
		else :
			listTokens = strUTF8Text.split(' ')

		dictSentTokenized[ strSentID ] = [ listTokens ]

	# POS tag document set
	# run stanford coreNLP to do tokenize, pos
	# the tokenize is constrained to only split sents at EOL so the input sent count == output sent count
	# also only split using whitespace, so the original tokenization is preserved (and hence the number of tokens remains the same)
	( dictStanfordTokens, dictStanfordPOS ) = soton_corenlppy.re.comp_sem_lib.exec_stanford_corenlp( dict_text = dictSentences, work_dir = os.path.abspath( '.' ), annotators = 'tokenize,ssplit,pos', option_list = ['-tokenizeOptions','americanize=false','-tokenize.whitespace','true','-ssplit.eolonly','true'], num_processes = max_processes, dict_openie_config = dict_config )

	'''
	# relic NLTK POS tagging
	dictTaggedSents = soton_corenlppy.common_parse_lib.pos_tag_tokenset_batch( document_token_set = dictSentTokenized, lang = 'en', dict_common_config = dict_config, max_processes = max_processes, timeout = 300 )
	'''

	# make IOB with 'O' tags
	listSentIOB = []
	listSortedSentID = sorted( list( dictSentences.keys() ) )
	for strSentID in listSortedSentID :
		treeObjPOS = dictStanfordPOS[strSentID]

		listIOB = []
		#for listPOS in dictTaggedSents[nSentIndex][0] :
		#	listIOB.append( ( listPOS[0], listPOS[1], 'O' ) )
		for nToken in range(len(treeObjPOS)) :
			strToken = ' '.join( treeObjPOS[nToken].leaves() )
			strPOS = treeObjPOS[nToken].label()
			listIOB.append( ( strToken, strPOS, 'O' ) )

		listSentIOB.append( listIOB )

	# all done
	return listSortedSentID, listSentIOB
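# usage sketch (illustrative only - 'sentences.txt' is a placeholder file of either "<text>" or "<id>\t<text>"
# rows; Stanford CoreNLP must be available as for streusle_to_IOB()) :
#
#	listSentID, listSentIOB = sentences_to_IOB( sentences_file = 'sentences.txt', allowed_id_set = None, tokenize_sents = True, max_processes = 4, dict_config = dict_config )
#	for nIndex in range(len(listSentID)) :
#		print( listSentID[nIndex], listSentIOB[nIndex] )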