-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathLoadSentenceClassificationData.m
More file actions
120 lines (107 loc) · 4.82 KB
/
LoadSentenceClassificationData.m
File metadata and controls
120 lines (107 loc) · 4.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
% Want to distribute this code? Have other questions? -> sbowman@stanford.edu
function data = LoadSentenceClassificationData(filename, wordMap, labelMap, hyperParams, fragment, labelSetIndex)
% LoadSentenceClassificationData Load one file of labeled sentence data.
%
% Each usable line of the raw file has the form "<label><TAB><sentence text>".
% Blank lines, lines whose first field starts with '%' or '-', lines with
% fewer than two tab-separated fields, and lines whose label is not a key of
% labelMap{labelSetIndex} are skipped (and echoed with disp).
%
% Inputs:
%   filename       - path of the raw data file.
%   wordMap        - vocabulary map; passed through to ProcessAndSave.
%   labelMap       - cell array of maps; labelMap{labelSetIndex} is queried
%                    with isKey and indexed by the label string.
%   hyperParams    - struct. Fields read here: useTrees, useLattices,
%                    parensInSequences, vocabName, statlog,
%                    ignorePreprocessedFiles, lineLimit.
%   fragment       - when true, examples are handed to ProcessAndSave in
%                    chunks of 10000 loaded examples, with the remainder
%                    processed under the '-final' file suffix. When false,
%                    everything is processed once under the '-full' suffix.
%   labelSetIndex  - index into labelMap; also stored as the second entry
%                    of every example's label vector.
%
% Output:
%   data - result of the last ProcessAndSave call, or the cached data loaded
%          from an existing '-full' file, or [] when fragment is true and a
%          '-final' file for this input already exists.

    % Suffix identifying the data representation in preprocessed file names.
    if hyperParams.useTrees
        typeSig = '-trees';
    elseif hyperParams.useLattices
        typeSig = '-lats';
    else
        typeSig = ['-seqs-par' num2str(hyperParams.parensInSequences)];
    end

    % fullfile (rather than a hard-coded '/') keeps a bare file name relative
    % to the current folder instead of pointing at the filesystem root.
    [pathname, filenamePart, ext] = fileparts(filename);

    if fragment
        % Check whether we already loaded this file.
        listing = dir(fullfile(pathname, ...
            ['pp-', filenamePart, ext, '-final-', hyperParams.vocabName, typeSig, '*']));
        if ~isempty(listing)
            Log(hyperParams.statlog, ['File ', filename, ' was already processed.']);
            % Assign the output so callers that request it do not hit an
            % "output argument not assigned" error.
            data = [];
            return
        end
    elseif ~hyperParams.ignorePreprocessedFiles
        % Check whether we already loaded this file.
        listing = dir(fullfile(pathname, ...
            ['pp-', filenamePart, ext, '-full-', hyperParams.vocabName, typeSig, '*']));
        if ~isempty(listing)
            Log(hyperParams.statlog, ['File ', filename, ' was already processed. Loading.']);
            try
                d = load(fullfile(pathname, listing(1).name), '-mat');
                data = d.data;
                return
            catch loadErr
                Log(hyperParams.statlog, 'Problem loading preprocessed data. Will reprocess raw file.');
                Log(hyperParams.statlog, ['Load error: ', loadErr.message]);
            end
        end
    end

    % Read the raw file, one cell entry per line.
    fid = fopen(filename);
    if fid == -1
        error('LoadSentenceClassificationData:fileOpenFailed', ...
            'Could not open data file: %s', filename);
    end
    try
        C = textscan(fid, '%s', 'delimiter', sprintf('\n'));
    catch readErr
        % Do not leak the file handle if reading fails.
        fclose(fid);
        rethrow(readErr);
    end
    fclose(fid);

    % Parse the file
    nextItemNo = 1;
    maxLine = min(length(C{1}), hyperParams.lineLimit);

    % Initialize the data array
    rawData = repmat(struct('label', 0, 'sentenceText', ''), maxLine, 1);

    % Which nextItemNo was the last to be included in the last MAT file.
    lastSave = 0;

    % Iterate over examples
    for line = 1:maxLine
        if ~isempty(C{1}{line})
            splitLine = textscan(C{1}{line}, '%s', 'delimiter', '\t');
            splitLine = splitLine{1};

            % Skip commented and unlabeled lines. The emptiness checks come
            % first so that lines with no fields or an empty first field are
            % skipped instead of raising an indexing error.
            isUsable = ~isempty(splitLine) && ~isempty(splitLine{1}) ...
                && (splitLine{1}(1) ~= '%') && (splitLine{1}(1) ~= '-') ...
                && (size(splitLine, 1) >= 2) ...
                && labelMap{labelSetIndex}.isKey(splitLine{1});

            if isUsable
                % rawData is refilled from index 1 after every fragment save.
                rawData(nextItemNo - lastSave).label = [ labelMap{labelSetIndex}(splitLine{1}); labelSetIndex ];
                rawData(nextItemNo - lastSave).sentenceText = splitLine{2};
                nextItemNo = nextItemNo + 1;
            else
                disp(['Skipped line: ' C{1}{line}]);
            end
        end

        % Flush a fragment every 10000 loaded examples. Requiring new
        % examples since the last flush prevents empty or duplicate flushes
        % when blank/skipped lines follow a multiple of 10000 (including 0).
        numLoaded = nextItemNo - 1;
        if fragment && numLoaded > lastSave && mod(numLoaded, 10000) == 0
            message = ['Lines loaded: ', num2str(numLoaded), '/~', num2str(maxLine)];
            Log(hyperParams.statlog, message);
            data = ProcessAndSave(rawData, wordMap, lastSave, nextItemNo, filename, hyperParams, fragment, typeSig);
            lastSave = numLoaded;
        end
    end

    % Process whatever has not been flushed yet.
    if fragment
        finalSuffix = '-final';
    else
        finalSuffix = '-full';
    end
    data = ProcessAndSave(rawData, wordMap, lastSave, nextItemNo, [filename, finalSuffix], hyperParams, fragment, typeSig);
end
function [ data ] = ProcessAndSave(rawData, wordMap, lastSave, nextItemNo, filename, hyperParams, fragment, typeSig)
% ProcessAndSave Convert raw text examples into model inputs and cache them.
%
% Converts the first nextItemNo - (lastSave + 1) entries of rawData into
% Tree, Lattice, or Sequence objects (selected by hyperParams.useTrees and
% hyperParams.useLattices) and copies their labels. Unless
% hyperParams.ignorePreprocessedFiles is set, the result is saved in the
% folder of filename as
%   pp-<name><ext>-<vocabName><typeSig>-<nextItemNo>.mat
% provided no file of that name exists yet.
%
% Inputs:
%   rawData     - struct array with fields 'label' and 'sentenceText'.
%   wordMap     - vocabulary map handed to the makeTree / makeLattice /
%                 makeSequence constructors.
%   lastSave    - number of examples already written by earlier calls.
%   nextItemNo  - one past the number of examples loaded so far.
%   filename    - raw file name (possibly with a '-final'/'-full' suffix)
%                 from which the output file name is derived.
%   hyperParams - struct. Fields read here: useTrees, useLattices, gpu,
%                 largeVocabMode, parensInSequences,
%                 ignorePreprocessedFiles, vocabName, statlog.
%   fragment    - not used in this function; kept for the existing callers.
%   typeSig     - representation suffix used in the output file name.
%
% Output:
%   data - numElements-by-1 struct array with fields 'label' and 'sentence'.

    numElements = nextItemNo - (lastSave + 1);

    if hyperParams.useTrees
        data = repmat(struct('label', 0, 'sentence', Tree()), numElements, 1);
        parfor dataInd = 1:numElements
            data(dataInd).sentence = Tree.makeTree(rawData(dataInd).sentenceText, wordMap);
            data(dataInd).label = rawData(dataInd).label;
        end
    elseif hyperParams.useLattices
        data = repmat(struct('label', 0, 'sentence', Lattice()), numElements, 1);
        % Loop-invariant flags, computed once instead of per iteration.
        useGpu = hyperParams.gpu;
        gpuSmallVocab = hyperParams.gpu && ~hyperParams.largeVocabMode;
        parfor dataInd = 1:numElements
            data(dataInd).sentence = Lattice.makeLattice(rawData(dataInd).sentenceText, wordMap, useGpu, gpuSmallVocab);
            data(dataInd).label = rawData(dataInd).label;
        end
    else
        data = repmat(struct('label', 0, 'sentence', Sequence()), numElements, 1);
        % Loop-invariant flags, computed once instead of per iteration.
        parensInSequences = hyperParams.parensInSequences;
        gpuSmallVocab = hyperParams.gpu && ~hyperParams.largeVocabMode;
        for dataInd = 1:numElements
            data(dataInd).sentence = Sequence.makeSequence(rawData(dataInd).sentenceText, wordMap, ...
                parensInSequences, gpuSmallVocab);
            data(dataInd).label = rawData(dataInd).label;
        end
    end

    if ~hyperParams.ignorePreprocessedFiles
        [pathname, filenamePart, ext] = fileparts(filename);
        % fullfile keeps a bare file name relative to the current folder;
        % concatenating with '/' would target the filesystem root instead.
        nameToSave = fullfile(pathname, ...
            ['pp-', filenamePart, ext, '-', hyperParams.vocabName, typeSig, '-', num2str(nextItemNo), '.mat']);
        listing = dir(nameToSave);
        % Double check that a file hasn't been written while we were processing.
        if isempty(listing)
            try
                save(nameToSave, 'data', '-v7.3');
            catch saveErr
                % Saving is best-effort: report the cause and carry on.
                Log(hyperParams.statlog, 'Problem saving.');
                Log(hyperParams.statlog, ['Save error: ', saveErr.message]);
            end
        end
    end
end