From 78829b6be16130b949ed99806d889a270c2b16d0 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Mon, 30 Nov 2020 16:37:20 +0100
Subject: [PATCH 01/22] Fixing some defaults if vocab not specified

---
 fairseq/data/handwriting/alphabet.py              | 15 +++++++++++----
 .../data/handwriting/handwriting_dictionary.py    |  4 ++--
 fairseq/tasks/scribblelens.py                     |  4 +++-
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/fairseq/data/handwriting/alphabet.py b/fairseq/data/handwriting/alphabet.py
index 460276cccb..befc75f0bb 100644
--- a/fairseq/data/handwriting/alphabet.py
+++ b/fairseq/data/handwriting/alphabet.py
@@ -62,9 +62,10 @@ class Alphabet:
     """
     def __init__(self, filename_=None, input_dict=None,
                  translation_dict={'_': ' '},
-                 unk=("@",), blank=("*",), space=(' ', '_')):
+                 unk=("@",), blank=("*",), space=(' ', '_'),
+                 ensure_in_dict_on_no_vocab=None): # option for ensuring chars in case the vocab is not given
 
-        if filename_:
+        if filename_:  # both None and '' will be 'False'
             self.chars = bidict(self.readDictionary(filename_))
             print('Alphabet constructed from', filename_,
                   'size=', len(self.chars))
@@ -73,12 +74,18 @@ def __init__(self, filename_=None, input_dict=None,
             print('Alphabet constructed from dictionnary, '
                   'size=', len(self.chars))
         else:
-            self.chars = bidict({
+            base_special_dict = {
                     k: i
                     for i, chs in enumerate([blank, space, unk])
                     for k in chs
-            })
+            }
+            if ensure_in_dict_on_no_vocab:
+                for c in ensure_in_dict_on_no_vocab:
+                    if c not in base_special_dict:
+                        base_special_dict[c] = len(base_special_dict)
+            self.chars = bidict(base_special_dict)
             print('Alphabet constructed empty')
+        
         for c in unk:
             if c not in self.chars:
                 print('Warning: UNK token', c, 'not in vocab')
diff --git a/fairseq/data/handwriting/handwriting_dictionary.py b/fairseq/data/handwriting/handwriting_dictionary.py
index af23179434..21ace2f5a9 100644
--- a/fairseq/data/handwriting/handwriting_dictionary.py
+++ b/fairseq/data/handwriting/handwriting_dictionary.py
@@ -16,11 +16,11 @@ def __init__(
         ):  #extra_special_symbols=None,):
 
         # [!] bos, pad, eos etc. need to be in dict file
-        super().__init__(alphabet_file, unk=(unk,))
+        super().__init__(alphabet_file, unk=(unk,), ensure_in_dict_on_no_vocab=(bos, pad, eos, unk))
         #self._alphabet = Alphabet(alphabet_file, unk=(unk,))  
         for c, descr in zip((bos, pad, eos, unk), ("bos", "pad", "eos", "unk")):
             if not self.existDict(c):
-                print('WARNING:', descr, 'token', c, 'not in vocab')
+                print('ERROR:', descr, 'token', c, 'not in vocab and vocab chosen, not constructed')
         self.bos_char, self.unk_char, self.pad_char, self.eos_char = bos, unk, pad, eos
         #self.symbols = []
         #self.count = []
diff --git a/fairseq/tasks/scribblelens.py b/fairseq/tasks/scribblelens.py
index cd910e2d3b..00b085fc92 100644
--- a/fairseq/tasks/scribblelens.py
+++ b/fairseq/tasks/scribblelens.py
@@ -98,7 +98,9 @@ def load_dataset(self, split, **kwargs):
             split (str): name of the split (e.g., train, valid, test)
         """
 
-        vocab_path = self.args.vocab_path if self.args.vocab_path is not None else self.args.data + '/tasman.alphabet.plus.space.mode5.json'
+        vocab_path = self.args.vocab_path if self.args.vocab_path is not None else ''  
+        # [now file in default location not used, needs to be specified, otherwise trying to construct vocab from scratch] 
+        # self.args.data + '/tasman.alphabet.plus.space.mode5.json'
 
         if not self.args.labels:
             self.datasets[split] = FileHandwritingDataset(

From 2c71d2c09058f50022bee7a30f352c1e666e2a31 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Mon, 30 Nov 2020 18:56:22 +0100
Subject: [PATCH 02/22] 'Hierarchical variance segmentation' initial version,
 without torch function using that to work on tensors

---
 fairseq/modules/__init__.py                   |  2 +
 fairseq/modules/segmentation/__init__.py      |  0
 .../hierarchical_variance_segmentation.py     | 90 +++++++++++++++++++
 fairseq/modules/segmentation/segment_dict.py  | 72 +++++++++++++++
 4 files changed, 164 insertions(+)
 create mode 100644 fairseq/modules/segmentation/__init__.py
 create mode 100644 fairseq/modules/segmentation/hierarchical_variance_segmentation.py
 create mode 100644 fairseq/modules/segmentation/segment_dict.py

diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py
index e2326ac6e3..9991681542 100644
--- a/fairseq/modules/__init__.py
+++ b/fairseq/modules/__init__.py
@@ -35,6 +35,7 @@
 from .unfold import unfold1d
 from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer
 from .vggblock import VGGBlock
+from .segmentation import hierarchicalVarianceSegmentation
 
 __all__ = [
     "AdaptiveInput",
@@ -72,5 +73,6 @@
     "TransformerEncoderLayer",
     "TransposeLast",
     "VGGBlock",
+    'hierarchicalVarianceSegmentation',
     "unfold1d",
 ]
diff --git a/fairseq/modules/segmentation/__init__.py b/fairseq/modules/segmentation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
new file mode 100644
index 0000000000..59ca1d36fe
--- /dev/null
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -0,0 +1,90 @@
+
+import numpy as np
+from .segment_dict import *
+from heapq import *
+
+
+def variance(linearSum, squaresSum, size):
+    return np.sum((squaresSum / size) - np.square(linearSum / size))  # sum of "variance vector"
+
+
+# lines is a tensor or array of tensors?
+def hierarchicalVarianceSegmentation(k, linesGPU):
+    
+    # tensor to CPU  (don't really need copy, will just need to put tensors in segmentsDict)
+    lines = linesGPU.detach().to('cpu').numpy()  
+    # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
+    # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
+    
+   # TODO check if tensor parts correctly taken etc. [!]
+    segmentsDict = SegmentDict(lines)
+    
+    # maybe will need to change this to arrays or so instead of dicts for efficiency
+    
+    # q for ranges to merge
+    q = []
+    
+    # every pair added only one time; after merges will need to add both to right and to left
+    for segm in segmentsDict.getSegments():
+        segmRight = segmentsDict.getSegmentRight(segm)
+        if segmRight is not None:
+            linSum1, sqSum1 = segmentsDict.getSegmentSums(segm)
+            linSum2, sqSum2 = segmentsDict.getSegmentSums(segmRight)
+            line1, left1, right1 = segm
+            line2, left2, right2 = segmRight
+            oldVar1 = variance(linSum1, sqSum1, right1 - left1 + 1)
+            oldVar2 = variance(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = variance(linSum1 + linSum2, sqSum1 + sqSum2, right2 - left1 + 1)
+            heappush(q, (mergedVariance - oldVar1 - oldVar2, segm, segmRight))
+       
+    varChanges = []
+    merges = []
+    
+    while len(q):
+    
+        varChange, left, right = heappop(q)
+        merged = segmentsDict.mergeSegments(left, right)  # checks if merge is valid
+        
+        if merged is None:  # old merge possibility, now impossible
+            continue
+        
+        varChanges.append(varChange)
+        merges.append((left, right))
+        
+        toLeft = segmentsDict.getSegmentLeft(merged)
+        toRight = segmentsDict.getSegmentRight(merged)
+        linSumMerged, sqSumMerged = segmentsDict.getSegmentSums(merged)
+        lineMerged, leftMerged, rightMerged = merged
+        varMerged = variance(linSumMerged, sqSumMerged, rightMerged - leftMerged + 1)
+        
+        if toLeft is not None:
+            linSum2, sqSum2 = segmentsDict.getSegmentSums(toLeft)
+            line2, left2, right2 = toLeft
+            oldVar2 = variance(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = variance(linSumMerged + linSum2, sqSumMerged + sqSum2, rightMerged - left2 + 1)
+            heappush(q, (mergedVariance - varMerged - oldVar2, toLeft, merged))
+            
+        if toRight is not None:
+            linSum2, sqSum2 = segmentsDict.getSegmentSums(toRight)
+            line2, left2, right2 = toRight
+            oldVar2 = variance(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = variance(linSumMerged + linSum2, sqSumMerged + sqSum2, right2 - leftMerged + 1)
+            heappush(q, (mergedVariance - varMerged - oldVar2, merged, toRight))
+            
+    return varChanges, merges
+
+
+
+if __name__ == '__main__':
+    # import ptvsd
+    # ptvsd.enable_attach(('0.0.0.0', 7309))
+    # print("Attach debugger now")
+    # ptvsd.wait_for_attach()
+
+    # run from .. with python -m segmentation.hierarchical_variance_segmentation
+
+    import torch
+
+    tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]]], dtype=torch.float64)
+    print(tensor[0][1])
+    print(hierarchicalVarianceSegmentation(7, tensor))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
\ No newline at end of file
diff --git a/fairseq/modules/segmentation/segment_dict.py b/fairseq/modules/segmentation/segment_dict.py
new file mode 100644
index 0000000000..9bde005fcd
--- /dev/null
+++ b/fairseq/modules/segmentation/segment_dict.py
@@ -0,0 +1,72 @@
+
+import numpy as np
+
+class SegmentDict:
+    
+    def __init__(self, lines):  # lines assumed to be of shape [#lines x line_len * k]
+        # (line#, place in line): (begin in line, end in line, sum(x), sum(x^2)) ; sums are possibly vectors
+        self._dct = {(i, j): (j, j, lines[i][j], np.square(lines[i][j])) for i in range(len(lines)) for j in range(len(lines[i]))}
+        
+    # there is a 'segment' implicit format (tuple) used: (line#, leftIndex(begin), rightIndex(end))
+        
+    def segmentInDict(self, segment):
+        line, leftIdx, rightIdx = segment
+        if (line, leftIdx) not in self._dct:
+            return False
+        _, rightIdxFromDict, _, _ = self._dct[(line, leftIdx)]
+        return rightIdx == rightIdxFromDict  # left already checked by key
+        
+    def removeSegment(self, segment):
+        line, leftIdx, rightIdx = segment
+        if (line, leftIdx) in self._dct:
+            del self._dct[(line, leftIdx)]
+        if (line, rightIdx) in self._dct:
+            del self._dct[(line, rightIdx)]
+            
+    def mergeSegments(self, segment1, segment2):
+        line1, left1, right1 = segment1
+        line2, left2, right2 = segment2
+        if not self.segmentInDict(segment1) or not self.segmentInDict(segment2) \
+           or line1 != line2 or right1 + 1 != left2:  # not subsequent
+            return None
+        linearSum1, squaresSum1 = self.getSegmentSums(segment1)
+        linearSum2, squaresSum2 = self.getSegmentSums(segment2)
+        # remove old segments
+        self.removeSegment(segment1)
+        self.removeSegment(segment2)
+        # add a new merged one
+        self._dct[(line1, left1)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
+        self._dct[(line1, right2)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
+        return (line1, left1, right2)
+            
+    def getSegments(self):
+        res = []
+        for (line, leftIdx) in self._dct.keys():
+            begin, end, _, _ = self._dct[(line, leftIdx)]
+            res.append((line, begin, end))
+        return res
+        
+    def getSegmentLeft(self, segment):
+        if not self.segmentInDict(segment):
+            return None
+        line, left, right = segment
+        if (line, left - 1) not in self._dct:
+            return None
+        segmLeft, segmRight, _, _ = self._dct[(line, left - 1)]
+        return (line, segmLeft, segmRight)
+    
+    def getSegmentRight(self, segment):
+        if not self.segmentInDict(segment):
+            return None
+        line, left, right = segment
+        if (line, right + 1) not in self._dct:
+            return None
+        segmLeft, segmRight, _, _ = self._dct[(line, right + 1)]
+        return (line, segmLeft, segmRight)
+    
+    def getSegmentSums(self, segment):
+        if not self.segmentInDict(segment):
+            return None
+        line, left, _ = segment
+        _, _, linearSum, squaresSum = self._dct[(line, left)]
+        return (linearSum, squaresSum)
\ No newline at end of file

From 67890ad4c121a57a5b6dc68f518f6f7c7fe3d9ea Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Tue, 1 Dec 2020 14:01:55 +0100
Subject: [PATCH 03/22] option for merging until desired num of segments

---
 .../hierarchical_variance_segmentation.py          |  6 +++---
 fairseq/modules/segmentation/segment_dict.py       | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index 59ca1d36fe..24125f64ec 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -9,7 +9,7 @@ def variance(linearSum, squaresSum, size):
 
 
 # lines is a tensor or array of tensors?
-def hierarchicalVarianceSegmentation(k, linesGPU):
+def hierarchicalVarianceSegmentation(linesGPU, k=None):  # k is per line (total num of segments to be made is k*numLines)
     
     # tensor to CPU  (don't really need copy, will just need to put tensors in segmentsDict)
     lines = linesGPU.detach().to('cpu').numpy()  
@@ -40,7 +40,7 @@ def hierarchicalVarianceSegmentation(k, linesGPU):
     varChanges = []
     merges = []
     
-    while len(q):
+    while len(q) and (k is None or segmentsDict.numSegments() > k * lines.shape[0]):
     
         varChange, left, right = heappop(q)
         merged = segmentsDict.mergeSegments(left, right)  # checks if merge is valid
@@ -87,4 +87,4 @@ def hierarchicalVarianceSegmentation(k, linesGPU):
 
     tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]]], dtype=torch.float64)
     print(tensor[0][1])
-    print(hierarchicalVarianceSegmentation(7, tensor))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
\ No newline at end of file
+    print(hierarchicalVarianceSegmentation(tensor, 2))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
\ No newline at end of file
diff --git a/fairseq/modules/segmentation/segment_dict.py b/fairseq/modules/segmentation/segment_dict.py
index 9bde005fcd..f3a1e5c835 100644
--- a/fairseq/modules/segmentation/segment_dict.py
+++ b/fairseq/modules/segmentation/segment_dict.py
@@ -6,9 +6,13 @@ class SegmentDict:
     def __init__(self, lines):  # lines assumed to be of shape [#lines x line_len * k]
         # (line#, place in line): (begin in line, end in line, sum(x), sum(x^2)) ; sums are possibly vectors
         self._dct = {(i, j): (j, j, lines[i][j], np.square(lines[i][j])) for i in range(len(lines)) for j in range(len(lines[i]))}
+        self._size = len(self._dct)  # sometimes 1, sometimes 2 netries per segment - better keep a counter
         
     # there is a 'segment' implicit format (tuple) used: (line#, leftIndex(begin), rightIndex(end))
         
+    def numSegments(self):
+        return self._size
+    
     def segmentInDict(self, segment):
         line, leftIdx, rightIdx = segment
         if (line, leftIdx) not in self._dct:
@@ -18,10 +22,15 @@ def segmentInDict(self, segment):
         
     def removeSegment(self, segment):
         line, leftIdx, rightIdx = segment
+        wasThere = False
         if (line, leftIdx) in self._dct:
             del self._dct[(line, leftIdx)]
+            wasThere = True
         if (line, rightIdx) in self._dct:
             del self._dct[(line, rightIdx)]
+            wasThere = True
+        if wasThere:
+            self._size -= 1
             
     def mergeSegments(self, segment1, segment2):
         line1, left1, right1 = segment1
@@ -31,12 +40,13 @@ def mergeSegments(self, segment1, segment2):
             return None
         linearSum1, squaresSum1 = self.getSegmentSums(segment1)
         linearSum2, squaresSum2 = self.getSegmentSums(segment2)
-        # remove old segments
+        # remove old segments; will update _size
         self.removeSegment(segment1)
         self.removeSegment(segment2)
-        # add a new merged one
+        # add a new merged one; need to update _size by hand
         self._dct[(line1, left1)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
         self._dct[(line1, right2)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
+        self._size += 1
         return (line1, left1, right2)
             
     def getSegments(self):

From 5dc5500e927ae7b758e40f910ccf872878413e60 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Tue, 1 Dec 2020 15:56:03 +0100
Subject: [PATCH 04/22] part of torch segmentation layer

---
 .../hierarchical_variance_segmentation.py     | 74 +++++++++++++++++--
 fairseq/modules/segmentation/segment_dict.py  | 31 +++++++-
 2 files changed, 97 insertions(+), 8 deletions(-)

diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index 24125f64ec..5bc3461f28 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -2,19 +2,15 @@
 import numpy as np
 from .segment_dict import *
 from heapq import *
-
+from torch.autograd import Function
 
 def variance(linearSum, squaresSum, size):
     return np.sum((squaresSum / size) - np.square(linearSum / size))  # sum of "variance vector"
 
 
 # lines is a tensor or array of tensors?
-def hierarchicalVarianceSegmentation(linesGPU, k=None):  # k is per line (total num of segments to be made is k*numLines)
-    
-    # tensor to CPU  (don't really need copy, will just need to put tensors in segmentsDict)
-    lines = linesGPU.detach().to('cpu').numpy()  
-    # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
-    # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
+# won't modify lines; assuming lines in on CPU if a tensor
+def hierarchicalVarianceSegmentation(lines, k=None):  # k is per line (total num of segments to be made is k*numLines)
     
    # TODO check if tensor parts correctly taken etc. [!]
     segmentsDict = SegmentDict(lines)
@@ -73,6 +69,70 @@ def hierarchicalVarianceSegmentation(linesGPU, k=None):  # k is per line (total
             
     return varChanges, merges
 
+class HierarchicalVarianceSegmentationLayer(Function):
+
+    @staticmethod
+    def flatten(x):
+        s = x.shape()
+        if len(s) < 3:
+            return x
+        if len(s) == 3:
+            return x.view(-1, s[2])
+        assert False
+
+    # perhaps that ^ is not needed, and restore_shapes also
+
+    @staticmethod
+    def forward(ctx, inputGPU, k=None, allowKrange=None):  # k for strict num of segments, allowKrange for range and choosing 'best' split point
+
+        assert k is None or allowKrange is None  # mutually exclusive options
+
+        # tensor to CPU  (don't really need copy, will just need to put tensors in segmentsDict)
+        input = inputGPU.detach().to('cpu').numpy()  
+        # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
+        # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
+
+        varChanges, merges = hierarchicalVarianceSegmentation(input, k=k)  # won't modify input
+        if allowKrange:  # full merge done above, k=None
+            begin, end = allowKrange
+            assert begin <= end
+            beginIdx = len(varChanges) - end  # max allowed num of segments, smallest num of merges
+            endIdx = len(varChanges) - begin  # min allowed num of segments, biggest num of merges
+            prefSums = []
+            s = 0.
+            for chng in varChanges:
+                s += chng
+                prefSums.append(chng)
+            best = -1
+            where = -1
+            for i in range(beginIdx, min(endIdx+1, len(varChanges))):
+                sufSum = s - prefSums[i]  # sum after this index
+                prefSum = prefSums[i] if prefSums[i] > 0. else 1.  # don't div by 0
+                # v the bigger the better split point; suffix div by prefix averages of variance change
+                here = (sufSum / (len(varChanges)-i))  /  (prefSum / (i+1.))  
+                if here > best:
+                    best = here
+                    where = i
+            if where == -1:
+                print("WARNING: problems choosing best num segments")
+                where = int((beginIdx + endIdx) // 2)
+            varChanges = varChanges[:where+1]  # this one is not really needed
+            merges = merges[:where+1]
+            
+        # now need to actually perform merge on tensor and later TODO bring it back to what device it was on (read at the beginning?)
+        # some union find or something? actually, can make it simpler, only need to see last merge for every place; just fill some 'merged' tensor with ID of segment
+        # but will also need to put that in order; maybe just sort segment tuples for that later
+        finalSegments, segmentNumsInLines = SegmentDict.getFinalSegments(merges)
+
+        ctx.save_for_backward(finalSegments, segmentNumsInLines)
+        ctx.mark_non_differentiable(finalSegments, segmentNumsInLines)
+
+        # TODO perform actual averaging and return as the 1st argument; 
+        # TODO this will need padding somehow...; 
+        # TODO actually, this also need to get a padding mask and ignore padded stuff; 
+        # TODO output its padding mask!
+
+        return [], finalSegments, segmentNumsInLines
 
 
 if __name__ == '__main__':
diff --git a/fairseq/modules/segmentation/segment_dict.py b/fairseq/modules/segmentation/segment_dict.py
index f3a1e5c835..2a6a18049d 100644
--- a/fairseq/modules/segmentation/segment_dict.py
+++ b/fairseq/modules/segmentation/segment_dict.py
@@ -79,4 +79,33 @@ def getSegmentSums(self, segment):
             return None
         line, left, _ = segment
         _, _, linearSum, squaresSum = self._dct[(line, left)]
-        return (linearSum, squaresSum)
\ No newline at end of file
+        return (linearSum, squaresSum)
+
+    @staticmethod
+    def getFinalSegments(merges, inputShape):
+
+        visited = np.zeros(inputShape, dt=np.int32)
+        finalSegments = []
+        for i in range(len(merges)-1,-1,-1):
+            leftSegm, rightSegm = merges[i]
+            line, beginLeft, endLeft = leftSegm
+            if visited[line][beginLeft] != 0:
+                continue  # merge already seen
+            _, beginRight, endRight = rightSegm
+            finalSegments.append((line, beginLeft, endRight))
+            visited[line][beginLeft:(endRight+1)] = 1
+
+        lineCounter = 0
+        prevLine = 1
+        res = {}  # {(line, #ofSegmentInLine): (line, beginIdx, endIdx)}
+        segmentsInLines = []  # numbers of segments in lines
+        for line, begin, end in sorted(finalSegments):
+            if line != prevLine:
+                prevLine = line
+                segmentsInLines.append(lineCounter)
+                lineCounter = 0
+            res[(line, lineCounter)] = (line, begin, end)
+            lineCounter += 1
+        segmentsInLines.append(lineCounter)
+
+        return res, segmentsInLines  # there will be always at least 1 segment in a line
\ No newline at end of file

From e035877baea28c8937ea7c63c0c7a1140db25ef6 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Tue, 1 Dec 2020 18:57:12 +0100
Subject: [PATCH 05/22] initial untested version of torch segmentation layer

---
 .../hierarchical_variance_segmentation.py     | 50 +++++++++++++------
 fairseq/modules/segmentation/segment_dict.py  |  6 +--
 2 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index 5bc3461f28..1395a79f7e 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -10,10 +10,10 @@ def variance(linearSum, squaresSum, size):
 
 # lines is a tensor or array of tensors?
 # won't modify lines; assuming lines in on CPU if a tensor
-def hierarchicalVarianceSegmentation(lines, k=None):  # k is per line (total num of segments to be made is k*numLines)
+def hierarchicalVarianceSegmentation(lines, padMask=None, k=None):  # k is per line (total num of segments to be made is k*numLines)
     
    # TODO check if tensor parts correctly taken etc. [!]
-    segmentsDict = SegmentDict(lines)
+    segmentsDict = SegmentDict(lines, padMask=padMask)
     
     # maybe will need to change this to arrays or so instead of dicts for efficiency
     
@@ -83,16 +83,21 @@ def flatten(x):
     # perhaps that ^ is not needed, and restore_shapes also
 
     @staticmethod
-    def forward(ctx, inputGPU, k=None, allowKrange=None):  # k for strict num of segments, allowKrange for range and choosing 'best' split point
+    def forward(ctx, inputGPU, padMask=None, k=None, allowKrange=None):  # k for strict num of segments, allowKrange for range and choosing 'best' split point
 
         assert k is None or allowKrange is None  # mutually exclusive options
 
+        # TODO if input only 2-dim, add another dimension possibly (W x H -> 1 x W x H, consistent with B x W x H - later assuming that in some places)
+
+        wasInputOnGPU = inputGPU.is_cuda
+        wasPadMaskOnGPU = padMask.is_cuda if padMask is not None else False
+
         # tensor to CPU  (don't really need copy, will just need to put tensors in segmentsDict)
         input = inputGPU.detach().to('cpu').numpy()  
         # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
         # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
 
-        varChanges, merges = hierarchicalVarianceSegmentation(input, k=k)  # won't modify input
+        varChanges, merges = hierarchicalVarianceSegmentation(input, padMask=padMask, k=k)  # won't modify input
         if allowKrange:  # full merge done above, k=None
             begin, end = allowKrange
             assert begin <= end
@@ -119,20 +124,37 @@ def forward(ctx, inputGPU, k=None, allowKrange=None):  # k for strict num of seg
             varChanges = varChanges[:where+1]  # this one is not really needed
             merges = merges[:where+1]
             
-        # now need to actually perform merge on tensor and later TODO bring it back to what device it was on (read at the beginning?)
-        # some union find or something? actually, can make it simpler, only need to see last merge for every place; just fill some 'merged' tensor with ID of segment
-        # but will also need to put that in order; maybe just sort segment tuples for that later
         finalSegments, segmentNumsInLines = SegmentDict.getFinalSegments(merges)
 
-        ctx.save_for_backward(finalSegments, segmentNumsInLines)
-        ctx.mark_non_differentiable(finalSegments, segmentNumsInLines)
+        maxSegments = max(segmentNumsInLines)
+        paddingMaskOut = np.full((input.shape[0], maxSegments), False)  #torch.BoolTensor(size=(input.shape[0], maxSegments)).fill_(False)
+        for i, n in enumerate(segmentNumsInLines):
+            paddingMaskOut[i][n:] = True
+        
+        segmented = np.full((input.shape[0], maxSegments, input.shape[2]), 0.)  #torch.tensor(size=(input.shape[0], maxSegments, input.shape[2])).fill_(0.)
+        for line, idxInLine in finalSegments.keySet():
+            line, begin, end = finalSegments[(line, idxInLine)]
+            segmented[line][idxInLine] = np.avg(input[line][begin:(end+1)])  #torch.mean(input[line][begin:(end+1)])
+
+        resOutput = torch.tensor(segmented).to('cuda') if wasInputOnGPU else torch.tensor(segmented)
+        resPadMask = torch.BoolTensor(paddingMaskOut).to('cuda') if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
+
+        ctx.save_for_backward(finalSegments, segmentNumsInLines, paddingMask, paddingMaskOut, input.shape)
+        ctx.mark_non_differentiable(resPadMask, finalSegments, segmentNumsInLines)  # TODO those/some are not tensors, not sure if should do that
+
+        return resOutput, resPadMask, finalSegments, segmentNumsInLines
+
+    @staticmethod
+    def backward(ctx, dxThrough, outPadMask=None, finalSegments=None, segmentNumsInLines=None):
+
+        finalSegments, segmentNumsInLines, paddingMask, paddingMaskOut, inputShape = ctx.saved_tensors
 
-        # TODO perform actual averaging and return as the 1st argument; 
-        # TODO this will need padding somehow...; 
-        # TODO actually, this also need to get a padding mask and ignore padded stuff; 
-        # TODO output its padding mask!
+        dx = torch.tensor(size=inputShape).fill_(0.)
+        for line, idxInLine in finalSegments.keySet():
+            line, begin, end = finalSegments[(line, idxInLine)]
+            dx[line][begin:(end+1)] = dxThrough[line][idxInLine] / (end - begin + 1)
 
-        return [], finalSegments, segmentNumsInLines
+        return dx, None, None, None
 
 
 if __name__ == '__main__':
diff --git a/fairseq/modules/segmentation/segment_dict.py b/fairseq/modules/segmentation/segment_dict.py
index 2a6a18049d..b9bcad4ca8 100644
--- a/fairseq/modules/segmentation/segment_dict.py
+++ b/fairseq/modules/segmentation/segment_dict.py
@@ -3,10 +3,10 @@
 
 class SegmentDict:
     
-    def __init__(self, lines):  # lines assumed to be of shape [#lines x line_len * k]
+    def __init__(self, lines, padMask=None):  # lines assumed to be of shape [#lines x line_len * k]
         # (line#, place in line): (begin in line, end in line, sum(x), sum(x^2)) ; sums are possibly vectors
-        self._dct = {(i, j): (j, j, lines[i][j], np.square(lines[i][j])) for i in range(len(lines)) for j in range(len(lines[i]))}
-        self._size = len(self._dct)  # sometimes 1, sometimes 2 netries per segment - better keep a counter
+        self._dct = {(i, j): (j, j, lines[i][j], np.square(lines[i][j])) for i in range(len(lines)) for j in range(len(lines[i])) if padMask is None or not padMask[i][j]}
+        self._size = len(self._dct)  # sometimes 1 (now all segments have 1 entry), sometimes later 2 entries per segment - better keep a counter
         
     # there is a 'segment' implicit format (tuple) used: (line#, leftIndex(begin), rightIndex(end))
         

From 400abc84ff9c9a3787b44c0f6e7f0c0b470b604a Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Tue, 1 Dec 2020 21:19:28 +0100
Subject: [PATCH 06/22] some fixes and testing to initial hierarchical variance
 clustering version

---
 .../hierarchical_variance_segmentation.py     | 83 +++++++++++++------
 fairseq/modules/segmentation/segment_dict.py  | 12 ++-
 2 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index 1395a79f7e..59d6009c23 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -2,14 +2,13 @@
 import numpy as np
 from .segment_dict import *
 from heapq import *
-from torch.autograd import Function
+from torch.autograd import Function, Variable
 
 def variance(linearSum, squaresSum, size):
     return np.sum((squaresSum / size) - np.square(linearSum / size))  # sum of "variance vector"
 
 
-# lines is a tensor or array of tensors?
-# won't modify lines; assuming lines in on CPU if a tensor
+# [!] lines has to be a numpy array, np.sum() crashes if done on tensor
 def hierarchicalVarianceSegmentation(lines, padMask=None, k=None):  # k is per line (total num of segments to be made is k*numLines)
     
    # TODO check if tensor parts correctly taken etc. [!]
@@ -83,9 +82,9 @@ def flatten(x):
     # perhaps that ^ is not needed, and restore_shapes also
 
     @staticmethod
-    def forward(ctx, inputGPU, padMask=None, k=None, allowKrange=None):  # k for strict num of segments, allowKrange for range and choosing 'best' split point
+    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):  # k for strict num of segments, allowKsumRange for range OF SUM OF SEGMENTS IN ALL LINES and choosing 'best' split point
 
-        assert k is None or allowKrange is None  # mutually exclusive options
+        assert k is None or allowKsumRange is None  # mutually exclusive options
 
         # TODO if input only 2-dim, add another dimension possibly (W x H -> 1 x W x H, consistent with B x W x H - later assuming that in some places)
 
@@ -98,23 +97,28 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKrange=None):  # k for str
         # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
 
         varChanges, merges = hierarchicalVarianceSegmentation(input, padMask=padMask, k=k)  # won't modify input
-        if allowKrange:  # full merge done above, k=None
-            begin, end = allowKrange
+        print("MERGES0: ", merges)
+        if allowKsumRange:  # full merge done above, k=None
+            begin, end = allowKsumRange
             assert begin <= end
-            beginIdx = len(varChanges) - end  # max allowed num of segments, smallest num of merges
-            endIdx = len(varChanges) - begin  # min allowed num of segments, biggest num of merges
+            beginIdx = input.shape[0] - 1 + len(varChanges) - end  # max allowed num of segments, smallest num of merges; input.shape[0] is num of segments if all merges done
+            endIdx = input.shape[0] - 1 + len(varChanges) - begin  # min allowed num of segments, biggest num of merges; input.shape[0] is num of segments if all merges done
+            print("::::::::::", beginIdx, endIdx)
             prefSums = []
             s = 0.
             for chng in varChanges:
                 s += chng
-                prefSums.append(chng)
+                prefSums.append(s)
             best = -1
             where = -1
+            print("PREFSUMS: ", prefSums)
             for i in range(beginIdx, min(endIdx+1, len(varChanges))):
                 sufSum = s - prefSums[i]  # sum after this index
-                prefSum = prefSums[i] if prefSums[i] > 0. else 1.  # don't div by 0
+                prefSum = prefSums[i] if prefSums[i] > 0. else .0000001  # don't div by 0
                 # v the bigger the better split point; suffix div by prefix averages of variance change
                 here = (sufSum / (len(varChanges)-i))  /  (prefSum / (i+1.))  
+                print("!", i, ":", prefSum ,sufSum, here)
+                
                 if here > best:
                     best = here
                     where = i
@@ -124,7 +128,9 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKrange=None):  # k for str
             varChanges = varChanges[:where+1]  # this one is not really needed
             merges = merges[:where+1]
             
-        finalSegments, segmentNumsInLines = SegmentDict.getFinalSegments(merges)
+        finalSegments, segmentNumsInLines = SegmentDict.getFinalSegments(merges, input.shape[:2], padMask=padMask)
+        print("MERGES: ", merges)
+        print("FINAL SEGMENTS: ", finalSegments)
 
         maxSegments = max(segmentNumsInLines)
         paddingMaskOut = np.full((input.shape[0], maxSegments), False)  #torch.BoolTensor(size=(input.shape[0], maxSegments)).fill_(False)
@@ -132,28 +138,43 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKrange=None):  # k for str
             paddingMaskOut[i][n:] = True
         
         segmented = np.full((input.shape[0], maxSegments, input.shape[2]), 0.)  #torch.tensor(size=(input.shape[0], maxSegments, input.shape[2])).fill_(0.)
-        for line, idxInLine in finalSegments.keySet():
+        for line, idxInLine in finalSegments.keys():
             line, begin, end = finalSegments[(line, idxInLine)]
-            segmented[line][idxInLine] = np.avg(input[line][begin:(end+1)])  #torch.mean(input[line][begin:(end+1)])
+            segmented[line][idxInLine] = np.mean(input[line][begin:(end+1)], axis=0)  #torch.mean(input[line][begin:(end+1)])
 
-        resOutput = torch.tensor(segmented).to('cuda') if wasInputOnGPU else torch.tensor(segmented)
+        resOutput = torch.tensor(segmented).to('cuda') if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
         resPadMask = torch.BoolTensor(paddingMaskOut).to('cuda') if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
 
-        ctx.save_for_backward(finalSegments, segmentNumsInLines, paddingMask, paddingMaskOut, input.shape)
-        ctx.mark_non_differentiable(resPadMask, finalSegments, segmentNumsInLines)  # TODO those/some are not tensors, not sure if should do that
+        print("********************", dir(ctx))
+        ctx.save_for_backward(padMask, resPadMask)
+        # save_for_backward is only for tensors / variables / stuff
+        ctx.finalSegments = finalSegments
+        ctx.segmentNumsInLines = segmentNumsInLines
+        ctx.inputShape = input.shape
+        ctx.mark_non_differentiable(resPadMask)  # can only pass torch variables here and only that makes sense
 
-        return resOutput, resPadMask, finalSegments, segmentNumsInLines
+        print("FINAL SEGMENTS: ", finalSegments, segmentNumsInLines)
+        return resOutput, resPadMask  #, finalSegments, segmentNumsInLines can only return torch variables... TODO maybe check how to fetch this info, but not sure if needed
 
     @staticmethod
-    def backward(ctx, dxThrough, outPadMask=None, finalSegments=None, segmentNumsInLines=None):
+    def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNumsInLines=None):
 
-        finalSegments, segmentNumsInLines, paddingMask, paddingMaskOut, inputShape = ctx.saved_tensors
+        print("AAAA")
 
-        dx = torch.tensor(size=inputShape).fill_(0.)
-        for line, idxInLine in finalSegments.keySet():
-            line, begin, end = finalSegments[(line, idxInLine)]
+        paddingMask, paddingMaskOut = ctx.saved_tensors
+
+        print("BBBB")
+
+        dx = torch.empty(size=ctx.inputShape).fill_(0.)
+
+        print("BBBB2")
+
+        for line, idxInLine in ctx.finalSegments.keys():
+            line, begin, end = ctx.finalSegments[(line, idxInLine)]
             dx[line][begin:(end+1)] = dxThrough[line][idxInLine] / (end - begin + 1)
 
+        print("CCCC")
+
         return dx, None, None, None
 
 
@@ -167,6 +188,18 @@ def backward(ctx, dxThrough, outPadMask=None, finalSegments=None, segmentNumsInL
 
     import torch
 
-    tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]]], dtype=torch.float64)
+    tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]]], dtype=torch.float64).requires_grad_(True)
     print(tensor[0][1])
-    print(hierarchicalVarianceSegmentation(tensor, 2))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
\ No newline at end of file
+    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=2))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
+    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=1))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
+
+    print("-------------------------- torch ---------------------------")
+    # (tensor, padMask, k, kSumRange)
+    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, None, None, (2,5))  #(2, 5))  # can;t have keyword args for torch Functions...
+    print(resOutput)
+    print(resPadMask)
+    #print(finalSegments)
+    #print(segmentNumsInLines)
+    #loss = Variable(resOutput, requires_grad=True)
+    resOutput.sum().backward()  # .backward() needs loss to be a number (tensor of size (1,))
+    print(tensor.grad)
\ No newline at end of file
diff --git a/fairseq/modules/segmentation/segment_dict.py b/fairseq/modules/segmentation/segment_dict.py
index b9bcad4ca8..ed3a6fe157 100644
--- a/fairseq/modules/segmentation/segment_dict.py
+++ b/fairseq/modules/segmentation/segment_dict.py
@@ -82,9 +82,9 @@ def getSegmentSums(self, segment):
         return (linearSum, squaresSum)
 
     @staticmethod
-    def getFinalSegments(merges, inputShape):
+    def getFinalSegments(merges, shape, padMask=None):  # shape needs to be B x W, without height!
 
-        visited = np.zeros(inputShape, dt=np.int32)
+        visited = np.zeros(shape, dtype=np.int32)
         finalSegments = []
         for i in range(len(merges)-1,-1,-1):
             leftSegm, rightSegm = merges[i]
@@ -95,8 +95,14 @@ def getFinalSegments(merges, inputShape):
             finalSegments.append((line, beginLeft, endRight))
             visited[line][beginLeft:(endRight+1)] = 1
 
+        # add length-1 segments that are there not padded but were not a part of any merge
+        for i in range(visited.shape[0]):
+            for j in range(visited.shape[1]):
+                if not visited[i][j] and (padMask is None or not padMask[i][j]):
+                    finalSegments.append((i, j, j))
+
         lineCounter = 0
-        prevLine = 1
+        prevLine = 0  # don't append useless 0 at the beginning
         res = {}  # {(line, #ofSegmentInLine): (line, beginIdx, endIdx)}
         segmentsInLines = []  # numbers of segments in lines
         for line, begin, end in sorted(finalSegments):

From 4a5afef7ec4af69f206668a6c70b0d09e9e925fb Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Wed, 2 Dec 2020 16:24:53 +0100
Subject: [PATCH 07/22] removed debug printout from parts that are to be used
 outside debugging & testing

---
 fairseq/modules/__init__.py                   |  3 +-
 .../hierarchical_variance_segmentation.py     | 31 +++++++------------
 2 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py
index 9991681542..f39839f066 100644
--- a/fairseq/modules/__init__.py
+++ b/fairseq/modules/__init__.py
@@ -73,6 +73,7 @@
     "TransformerEncoderLayer",
     "TransposeLast",
     "VGGBlock",
-    'hierarchicalVarianceSegmentation',
+    "hierarchicalVarianceSegmentation",
+    "HierarchicalVarianceSegmentationLayer",
     "unfold1d",
 ]
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index 59d6009c23..b8ad876741 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -97,13 +97,13 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):  # k for
         # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
 
         varChanges, merges = hierarchicalVarianceSegmentation(input, padMask=padMask, k=k)  # won't modify input
-        print("MERGES0: ", merges)
+        #print("MERGES0: ", merges)
         if allowKsumRange:  # full merge done above, k=None
             begin, end = allowKsumRange
             assert begin <= end
             beginIdx = input.shape[0] - 1 + len(varChanges) - end  # max allowed num of segments, smallest num of merges; input.shape[0] is num of segments if all merges done
             endIdx = input.shape[0] - 1 + len(varChanges) - begin  # min allowed num of segments, biggest num of merges; input.shape[0] is num of segments if all merges done
-            print("::::::::::", beginIdx, endIdx)
+            #print("::::::::::", beginIdx, endIdx)
             prefSums = []
             s = 0.
             for chng in varChanges:
@@ -111,26 +111,26 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):  # k for
                 prefSums.append(s)
             best = -1
             where = -1
-            print("PREFSUMS: ", prefSums)
+            #print("PREFSUMS: ", prefSums)
             for i in range(beginIdx, min(endIdx+1, len(varChanges))):
                 sufSum = s - prefSums[i]  # sum after this index
                 prefSum = prefSums[i] if prefSums[i] > 0. else .0000001  # don't div by 0
                 # v the bigger the better split point; suffix div by prefix averages of variance change
                 here = (sufSum / (len(varChanges)-i))  /  (prefSum / (i+1.))  
-                print("!", i, ":", prefSum ,sufSum, here)
+                #print("!", i, ":", prefSum ,sufSum, here)
                 
                 if here > best:
                     best = here
                     where = i
             if where == -1:
-                print("WARNING: problems choosing best num segments")
+                #print("WARNING: problems choosing best num segments")
                 where = int((beginIdx + endIdx) // 2)
             varChanges = varChanges[:where+1]  # this one is not really needed
             merges = merges[:where+1]
             
         finalSegments, segmentNumsInLines = SegmentDict.getFinalSegments(merges, input.shape[:2], padMask=padMask)
-        print("MERGES: ", merges)
-        print("FINAL SEGMENTS: ", finalSegments)
+        #print("MERGES: ", merges)
+        #print("FINAL SEGMENTS: ", finalSegments)
 
         maxSegments = max(segmentNumsInLines)
         paddingMaskOut = np.full((input.shape[0], maxSegments), False)  #torch.BoolTensor(size=(input.shape[0], maxSegments)).fill_(False)
@@ -145,7 +145,7 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):  # k for
         resOutput = torch.tensor(segmented).to('cuda') if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
         resPadMask = torch.BoolTensor(paddingMaskOut).to('cuda') if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
 
-        print("********************", dir(ctx))
+        #print("********************", dir(ctx))
         ctx.save_for_backward(padMask, resPadMask)
         # save_for_backward is only for tensors / variables / stuff
         ctx.finalSegments = finalSegments
@@ -153,28 +153,19 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):  # k for
         ctx.inputShape = input.shape
         ctx.mark_non_differentiable(resPadMask)  # can only pass torch variables here and only that makes sense
 
-        print("FINAL SEGMENTS: ", finalSegments, segmentNumsInLines)
+        #print("FINAL SEGMENTS: ", finalSegments, segmentNumsInLines)
         return resOutput, resPadMask  #, finalSegments, segmentNumsInLines can only return torch variables... TODO maybe check how to fetch this info, but not sure if needed
 
     @staticmethod
     def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNumsInLines=None):
 
-        print("AAAA")
-
         paddingMask, paddingMaskOut = ctx.saved_tensors
-
-        print("BBBB")
-
         dx = torch.empty(size=ctx.inputShape).fill_(0.)
 
-        print("BBBB2")
-
         for line, idxInLine in ctx.finalSegments.keys():
             line, begin, end = ctx.finalSegments[(line, idxInLine)]
             dx[line][begin:(end+1)] = dxThrough[line][idxInLine] / (end - begin + 1)
 
-        print("CCCC")
-
         return dx, None, None, None
 
 
@@ -188,14 +179,14 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
 
     import torch
 
-    tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9]]], dtype=torch.float64).requires_grad_(True)
+    tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]]], dtype=torch.float64).requires_grad_(True)
     print(tensor[0][1])
     print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=2))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
     print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=1))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
 
     print("-------------------------- torch ---------------------------")
     # (tensor, padMask, k, kSumRange)
-    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, None, None, (2,5))  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5))  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     #print(finalSegments)

From 2bf242a0d2be88ccf969b337d90d475c422492f8 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Wed, 2 Dec 2020 17:01:48 +0100
Subject: [PATCH 08/22] torch tensor device fixes in forward/backward + num of
 segments taken as sum for whole batch

---
 .../models/wav2vec/wav2vec2_scribblelens.py   |  2 ++
 .../hierarchical_variance_segmentation.py     | 33 ++++++++++++++-----
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index 88c3798462..a7207ec523 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -552,6 +552,8 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False):
             padding_mask = padding_mask[:, ::scale]
             assert np.all(padding_mask.shape == features.shape[:-1])
 
+        # TODO add here a function for segmentation (+command line params) and update mask
+
         if self.post_extract_proj is not None:
             features = self.post_extract_proj(features)
 
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index b8ad876741..bb627020ca 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -9,7 +9,7 @@ def variance(linearSum, squaresSum, size):
 
 
 # [!] lines has to be a numpy array, np.sum() crashes if done on tensor
-def hierarchicalVarianceSegmentation(lines, padMask=None, k=None):  # k is per line (total num of segments to be made is k*numLines)
+def hierarchicalVarianceSegmentation(lines, padMask=None, k=None):  # k is sum of number of segments for all lines
     
    # TODO check if tensor parts correctly taken etc. [!]
     segmentsDict = SegmentDict(lines, padMask=padMask)
@@ -35,7 +35,7 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None):  # k is per l
     varChanges = []
     merges = []
     
-    while len(q) and (k is None or segmentsDict.numSegments() > k * lines.shape[0]):
+    while len(q) and (k is None or segmentsDict.numSegments() > k):
     
         varChange, left, right = heappop(q)
         merged = segmentsDict.mergeSegments(left, right)  # checks if merge is valid
@@ -82,14 +82,15 @@ def flatten(x):
     # perhaps that ^ is not needed, and restore_shapes also
 
     @staticmethod
-    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):  # k for strict num of segments, allowKsumRange for range OF SUM OF SEGMENTS IN ALL LINES and choosing 'best' split point
+    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None): 
+    # k for strict num of segments (SUM FOR ALL LINES), allowKsumRange for range OF SUM OF SEGMENTS IN ALL LINES and choosing 'best' split point
 
         assert k is None or allowKsumRange is None  # mutually exclusive options
 
         # TODO if input only 2-dim, add another dimension possibly (W x H -> 1 x W x H, consistent with B x W x H - later assuming that in some places)
 
-        wasInputOnGPU = inputGPU.is_cuda
-        wasPadMaskOnGPU = padMask.is_cuda if padMask is not None else False
+        inputDevice = inputGPU.device
+        padMaskInputDevice = padMask.device if padMask is not None else False
 
         # tensor to CPU  (don't really need copy, will just need to put tensors in segmentsDict)
         input = inputGPU.detach().to('cpu').numpy()  
@@ -142,8 +143,8 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):  # k for
             line, begin, end = finalSegments[(line, idxInLine)]
             segmented[line][idxInLine] = np.mean(input[line][begin:(end+1)], axis=0)  #torch.mean(input[line][begin:(end+1)])
 
-        resOutput = torch.tensor(segmented).to('cuda') if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
-        resPadMask = torch.BoolTensor(paddingMaskOut).to('cuda') if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
+        resOutput = torch.tensor(segmented).to(inputDevice)   #if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
+        resPadMask = torch.BoolTensor(paddingMaskOut).to(padMaskInputDevice)   #if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
 
         #print("********************", dir(ctx))
         ctx.save_for_backward(padMask, resPadMask)
@@ -159,13 +160,17 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):  # k for
     @staticmethod
     def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNumsInLines=None):
 
+        dxThroughDevice = dxThrough.device
+
         paddingMask, paddingMaskOut = ctx.saved_tensors
-        dx = torch.empty(size=ctx.inputShape).fill_(0.)
+        dx = torch.empty(size=ctx.inputShape).fill_(0.).to('cpu')
 
         for line, idxInLine in ctx.finalSegments.keys():
             line, begin, end = ctx.finalSegments[(line, idxInLine)]
             dx[line][begin:(end+1)] = dxThrough[line][idxInLine] / (end - begin + 1)
 
+        dx = dx.to(dxThroughDevice)
+
         return dx, None, None, None
 
 
@@ -193,4 +198,16 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
     #print(segmentNumsInLines)
     #loss = Variable(resOutput, requires_grad=True)
     resOutput.sum().backward()  # .backward() needs loss to be a number (tensor of size (1,))
+    print(tensor.grad)
+
+    print("-------------------------- torch2 ---------------------------")
+    # (tensor, padMask, k, kSumRange)
+    tensor.grad.data.zero_()
+    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None)  #(2, 5))  # can;t have keyword args for torch Functions...
+    print(resOutput)
+    print(resPadMask)
+    #print(finalSegments)
+    #print(segmentNumsInLines)
+    #loss = Variable(resOutput, requires_grad=True)
+    resOutput.sum().backward()  # .backward() needs loss to be a number (tensor of size (1,))
     print(tensor.grad)
\ No newline at end of file

From 8f274757b5c6ebc5e8d5a28db8f394d5faefa5c3 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Wed, 2 Dec 2020 21:09:49 +0100
Subject: [PATCH 09/22] using segmentation in wav2vec2_scribblelens as an
 option, but getting errors because of fairseq data utils implicit assumptions

---
 fairseq/data/data_utils.py                    |  3 ++-
 .../models/wav2vec/wav2vec2_scribblelens.py   | 26 ++++++++++++++++++-
 fairseq/modules/__init__.py                   |  2 +-
 .../hierarchical_variance_segmentation.py     |  9 +++----
 uwr_related/test_cmd_scribble.sh              |  1 +
 5 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py
index 81f457365a..4cb17cf321 100644
--- a/fairseq/data/data_utils.py
+++ b/fairseq/data/data_utils.py
@@ -471,7 +471,8 @@ def arrange(s, e, length, keep_length):
 
     min_len = min([len(m) for m in mask_idcs])
     for i, mask_idc in enumerate(mask_idcs):
-        if len(mask_idc) > min_len:
+        if len(mask_idc) > min_len:  # TODO this check is incorrect and can lead to 0 masked elements with minimum masked set to > 0
+            # TODO 2 choosing indices to mask also seems incorrect, once got 0 chosen with minimum=2 (though there was only 1 unmasked in that line)
             mask_idc = np.random.choice(mask_idc, min_len, replace=False)
         mask[i, mask_idc] = True
 
diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index a7207ec523..9f1bc85ec8 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -25,6 +25,7 @@
     MultiheadAttention,
     SamePad,
     TransposeLast,
+    HierarchicalVarianceSegmentationLayer,
 )
 from fairseq.modules.transformer_sentence_encoder import init_bert_params
 from fairseq.utils import buffered_arange
@@ -298,6 +299,10 @@ def add_args(parser):
             "--conv-bias", action="store_true", help="include bias in conv encoder"
         )
 
+        parser.add_argument(
+            "--segm", type=str, help="use segmentation on representations; 'var' (without ') for variance-based hierarchical segm"
+        )
+
     def __init__(self, args):
         super().__init__()
         self.args = args
@@ -398,6 +403,11 @@ def __init__(self, args):
 
         self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim)
 
+        if 'segm' in args:
+            self.segm = args.segm
+        else:
+            self.segm = None
+
     def upgrade_state_dict_named(self, state_dict, name):
         super().upgrade_state_dict_named(state_dict, name)
         """Upgrade a (possibly old) state dict for new versions of fairseq."""
@@ -541,7 +551,7 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False):
 
         features = features.transpose(1, 2)
         features = self.layer_norm(features)
-        unmasked_features = features.clone()
+        # unmasked_features = features.clone() needed to move after segmentation
 
         if padding_mask is not None:
             assert padding_mask.size(1) == 1
@@ -553,7 +563,13 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False):
             assert np.all(padding_mask.shape == features.shape[:-1])
 
         # TODO add here a function for segmentation (+command line params) and update mask
+        if self.segm:
+            features, padding_mask = self.segmentation(features, padding_mask)
+
+        unmasked_features = features.clone()
 
+        # doing it here as needed to clone features after segmentation and clone was before post_extract_proj 
+        # - [!] TODO check if this (cloning before post_extract_proj) is intended, looks very weird to me
         if self.post_extract_proj is not None:
             features = self.post_extract_proj(features)
 
@@ -650,6 +666,14 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False):
 
         return result
 
+    def segmentation(self, features, padding_mask):
+        assert self.segm == 'var'  # for now only that supported, to be extended
+        non_padded = padding_mask.numel() - padding_mask.sum().item()
+        base_len_sum = non_padded / 3
+        min_segm = max(features.shape[0], int(round(0.85*base_len_sum)))
+        max_segm = min(non_padded, int(round(1.15*base_len_sum)))
+        return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm))
+
     def quantize(self, x):
         assert self.quantizer is not None
         x = self.feature_extractor(x)
diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py
index f39839f066..91fcbd3653 100644
--- a/fairseq/modules/__init__.py
+++ b/fairseq/modules/__init__.py
@@ -35,7 +35,7 @@
 from .unfold import unfold1d
 from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer
 from .vggblock import VGGBlock
-from .segmentation import hierarchicalVarianceSegmentation
+from .segmentation.hierarchical_variance_segmentation import hierarchicalVarianceSegmentation, HierarchicalVarianceSegmentationLayer
 
 __all__ = [
     "AdaptiveInput",
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index bb627020ca..56eb340938 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -1,4 +1,5 @@
 
+import torch
 import numpy as np
 from .segment_dict import *
 from heapq import *
@@ -143,7 +144,7 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):
             line, begin, end = finalSegments[(line, idxInLine)]
             segmented[line][idxInLine] = np.mean(input[line][begin:(end+1)], axis=0)  #torch.mean(input[line][begin:(end+1)])
 
-        resOutput = torch.tensor(segmented).to(inputDevice)   #if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
+        resOutput = torch.tensor(segmented, dtype=inputGPU.dtype).to(inputDevice)   #if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
         resPadMask = torch.BoolTensor(paddingMaskOut).to(padMaskInputDevice)   #if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
 
         #print("********************", dir(ctx))
@@ -163,7 +164,7 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
         dxThroughDevice = dxThrough.device
 
         paddingMask, paddingMaskOut = ctx.saved_tensors
-        dx = torch.empty(size=ctx.inputShape).fill_(0.).to('cpu')
+        dx = torch.empty(size=ctx.inputShape, dtype=dxThrough.dtype).fill_(0.).to('cpu')
 
         for line, idxInLine in ctx.finalSegments.keys():
             line, begin, end = ctx.finalSegments[(line, idxInLine)]
@@ -182,12 +183,10 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
 
     # run from .. with python -m segmentation.hierarchical_variance_segmentation
 
-    import torch
-
     tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]]], dtype=torch.float64).requires_grad_(True)
     print(tensor[0][1])
+    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=4))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
     print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=2))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
-    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=1))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
 
     print("-------------------------- torch ---------------------------")
     # (tensor, padMask, k, kSumRange)
diff --git a/uwr_related/test_cmd_scribble.sh b/uwr_related/test_cmd_scribble.sh
index 0fd001454a..cce3fa6d0b 100755
--- a/uwr_related/test_cmd_scribble.sh
+++ b/uwr_related/test_cmd_scribble.sh
@@ -57,4 +57,5 @@ python train.py --distributed-world-size 1 --update-freq 2 \
   --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --max-tokens 10000 --max-update 400000 \
   --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
   --labels `#can be removed for no labels` \
+  `#--segm var  # optional segmentation` \ 
   --enable-padding # crashes without that, needs to make all lines same-size
\ No newline at end of file

From b8ded9cfdf78f0a543bf0d0e26d573516bc3e9e5 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Thu, 3 Dec 2020 18:42:19 +0100
Subject: [PATCH 10/22] added minimum number of segments per line so that
 masking at least 2 does not crash on too short lines

---
 fairseq/data/data_utils.py                    |  7 +--
 .../models/wav2vec/wav2vec2_scribblelens.py   |  7 +--
 .../hierarchical_variance_segmentation.py     | 45 ++++++++++++-------
 fairseq/modules/segmentation/segment_dict.py  | 14 +++++-
 4 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/fairseq/data/data_utils.py b/fairseq/data/data_utils.py
index 4cb17cf321..121155f373 100644
--- a/fairseq/data/data_utils.py
+++ b/fairseq/data/data_utils.py
@@ -469,10 +469,11 @@ def arrange(s, e, length, keep_length):
 
         mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
 
-    min_len = min([len(m) for m in mask_idcs])
+    min_len = min([len(m) for m in mask_idcs])  
+    # [!] input sequence outside padding has to have appropriate length for this to work correctly 
+    # (e.g. length 1 with min_mask=2 can cause problems)
     for i, mask_idc in enumerate(mask_idcs):
-        if len(mask_idc) > min_len:  # TODO this check is incorrect and can lead to 0 masked elements with minimum masked set to > 0
-            # TODO 2 choosing indices to mask also seems incorrect, once got 0 chosen with minimum=2 (though there was only 1 unmasked in that line)
+        if len(mask_idc) > min_len:  # they want same number of masked stuff per line as a simplification
             mask_idc = np.random.choice(mask_idc, min_len, replace=False)
         mask[i, mask_idc] = True
 
diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index 9f1bc85ec8..f782937df9 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -564,7 +564,8 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False):
 
         # TODO add here a function for segmentation (+command line params) and update mask
         if self.segm:
-            features, padding_mask = self.segmentation(features, padding_mask)
+            features, padding_mask = self.segmentation(features, padding_mask, 5)  
+            # [!] minSegmsPerLine needs to be at least a few so that part with masking with at least 2 masks works correctly
 
         unmasked_features = features.clone()
 
@@ -666,13 +667,13 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False):
 
         return result
 
-    def segmentation(self, features, padding_mask):
+    def segmentation(self, features, padding_mask, minSegmsPerLine):
         assert self.segm == 'var'  # for now only that supported, to be extended
         non_padded = padding_mask.numel() - padding_mask.sum().item()
         base_len_sum = non_padded / 3
         min_segm = max(features.shape[0], int(round(0.85*base_len_sum)))
         max_segm = min(non_padded, int(round(1.15*base_len_sum)))
-        return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm))
+        return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm), minSegmsPerLine)
 
     def quantize(self, x):
         assert self.quantizer is not None
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index 56eb340938..6ae37897d3 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -10,10 +10,10 @@ def variance(linearSum, squaresSum, size):
 
 
 # [!] lines has to be a numpy array, np.sum() crashes if done on tensor
-def hierarchicalVarianceSegmentation(lines, padMask=None, k=None):  # k is sum of number of segments for all lines
+def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLine=None):  # k is sum of number of segments for all lines
     
    # TODO check if tensor parts correctly taken etc. [!]
-    segmentsDict = SegmentDict(lines, padMask=padMask)
+    segmentsDict = SegmentDict(lines, padMask=padMask, minSegmsPerLine=minSegmsPerLine)
     
     # maybe will need to change this to arrays or so instead of dicts for efficiency
     
@@ -36,12 +36,12 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None):  # k is sum o
     varChanges = []
     merges = []
     
-    while len(q) and (k is None or segmentsDict.numSegments() > k):
+    while len(q) and (k is None or segmentsDict.numSegments() > k):  # will stop merging before k reached if minSegmsPerLine reached
     
         varChange, left, right = heappop(q)
         merged = segmentsDict.mergeSegments(left, right)  # checks if merge is valid
         
-        if merged is None:  # old merge possibility, now impossible
+        if merged is None:  # old merge possibility, now impossible (or minSegmsPerLine reached for this line)
             continue
         
         varChanges.append(varChange)
@@ -67,7 +67,7 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None):  # k is sum o
             mergedVariance = variance(linSumMerged + linSum2, sqSumMerged + sqSum2, right2 - leftMerged + 1)
             heappush(q, (mergedVariance - varMerged - oldVar2, merged, toRight))
             
-    return varChanges, merges
+    return varChanges, merges, segmentsDict
 
 class HierarchicalVarianceSegmentationLayer(Function):
 
@@ -83,8 +83,9 @@ def flatten(x):
     # perhaps that ^ is not needed, and restore_shapes also
 
     @staticmethod
-    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None): 
+    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPerLine=None): 
     # k for strict num of segments (SUM FOR ALL LINES), allowKsumRange for range OF SUM OF SEGMENTS IN ALL LINES and choosing 'best' split point
+    # min and max number of merges adjusted to what is possible - e.g. because of minSegmsPerLine
 
         assert k is None or allowKsumRange is None  # mutually exclusive options
 
@@ -98,13 +99,14 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):
         # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
         # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
 
-        varChanges, merges = hierarchicalVarianceSegmentation(input, padMask=padMask, k=k)  # won't modify input
+        varChanges, merges, segmentsDict = hierarchicalVarianceSegmentation(input, padMask=padMask, k=k, minSegmsPerLine=minSegmsPerLine)  # won't modify input
         #print("MERGES0: ", merges)
-        if allowKsumRange:  # full merge done above, k=None
+        if allowKsumRange:  # full merge done above, k=None, so each line now has minSegmsPerLine, but can also just get it from SegmDict - cleaner
             begin, end = allowKsumRange
             assert begin <= end
-            beginIdx = input.shape[0] - 1 + len(varChanges) - end  # max allowed num of segments, smallest num of merges; input.shape[0] is num of segments if all merges done
-            endIdx = input.shape[0] - 1 + len(varChanges) - begin  # min allowed num of segments, biggest num of merges; input.shape[0] is num of segments if all merges done
+            # [!] min and max number of merges adjusted to what is possible - e.g. because of minSegmsPerLine
+            beginIdx = max(0, min(len(varChanges) - 1, (segmentsDict.numSegments() + (len(varChanges) - 1) - end)))  # max allowed num of segments, smallest num of merges; input.shape[0] is num of segments if all merges done
+            endIdx = max(0, min(len(varChanges) - 1, (segmentsDict.numSegments() + (len(varChanges) - 1) - begin)))  # min allowed num of segments, biggest num of merges; input.shape[0] is num of segments if all merges done
             #print("::::::::::", beginIdx, endIdx)
             prefSums = []
             s = 0.
@@ -125,7 +127,7 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None):
                     best = here
                     where = i
             if where == -1:
-                #print("WARNING: problems choosing best num segments")
+                print("WARNING: problems choosing best num segments")
                 where = int((beginIdx + endIdx) // 2)
             varChanges = varChanges[:where+1]  # this one is not really needed
             merges = merges[:where+1]
@@ -172,7 +174,7 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
 
         dx = dx.to(dxThroughDevice)
 
-        return dx, None, None, None
+        return dx, None, None, None, None
 
 
 if __name__ == '__main__':
@@ -185,12 +187,12 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
 
     tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]]], dtype=torch.float64).requires_grad_(True)
     print(tensor[0][1])
-    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=4))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
-    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=2))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
+    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=4, minSegmsPerLine=None))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
+    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=2, minSegmsPerLine=None))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
 
     print("-------------------------- torch ---------------------------")
     # (tensor, padMask, k, kSumRange)
-    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5))  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     #print(finalSegments)
@@ -202,11 +204,22 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
     print("-------------------------- torch2 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None)  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     #print(finalSegments)
     #print(segmentNumsInLines)
     #loss = Variable(resOutput, requires_grad=True)
+    resOutput.sum().backward()  # .backward() needs loss to be a number (tensor of size (1,))
+    print(tensor.grad)
+
+    print("-------------------------- torch3 ---------------------------")
+    # (tensor, padMask, k, kSumRange)
+    tensor.grad.data.zero_()
+    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2)  #(2, 5))  # can;t have keyword args for torch Functions...
+    print(resOutput)
+    print(resPadMask)
+    # [!] here will return 4 segments instead of specified 3, because of specified minSegmsPerLine
+
     resOutput.sum().backward()  # .backward() needs loss to be a number (tensor of size (1,))
     print(tensor.grad)
\ No newline at end of file
diff --git a/fairseq/modules/segmentation/segment_dict.py b/fairseq/modules/segmentation/segment_dict.py
index ed3a6fe157..5794ecb58e 100644
--- a/fairseq/modules/segmentation/segment_dict.py
+++ b/fairseq/modules/segmentation/segment_dict.py
@@ -3,10 +3,16 @@
 
 class SegmentDict:
     
-    def __init__(self, lines, padMask=None):  # lines assumed to be of shape [#lines x line_len * k]
+    def __init__(self, lines, padMask=None, minSegmsPerLine=None):  # lines assumed to be of shape [#lines x line_len * k]
         # (line#, place in line): (begin in line, end in line, sum(x), sum(x^2)) ; sums are possibly vectors
         self._dct = {(i, j): (j, j, lines[i][j], np.square(lines[i][j])) for i in range(len(lines)) for j in range(len(lines[i])) if padMask is None or not padMask[i][j]}
         self._size = len(self._dct)  # sometimes 1 (now all segments have 1 entry), sometimes later 2 entries per segment - better keep a counter
+
+        self._line_segms = [0 for i in range(lines.shape[0])]
+        for line, _ in self._dct:
+            self._line_segms[line] += 1
+
+        self.minSegmsPerLine = minSegmsPerLine
         
     # there is a 'segment' implicit format (tuple) used: (line#, leftIndex(begin), rightIndex(end))
         
@@ -31,12 +37,15 @@ def removeSegment(self, segment):
             wasThere = True
         if wasThere:
             self._size -= 1
+            self._line_segms[line] -= 1
             
     def mergeSegments(self, segment1, segment2):
         line1, left1, right1 = segment1
         line2, left2, right2 = segment2
         if not self.segmentInDict(segment1) or not self.segmentInDict(segment2) \
-           or line1 != line2 or right1 + 1 != left2:  # not subsequent
+           or line1 != line2 or right1 + 1 != left2 \
+           or (self.minSegmsPerLine and self._line_segms[line1] <= self.minSegmsPerLine):  
+           # not subsequent or too few segments in line
             return None
         linearSum1, squaresSum1 = self.getSegmentSums(segment1)
         linearSum2, squaresSum2 = self.getSegmentSums(segment2)
@@ -47,6 +56,7 @@ def mergeSegments(self, segment1, segment2):
         self._dct[(line1, left1)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
         self._dct[(line1, right2)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
         self._size += 1
+        self._line_segms[line1] += 1
         return (line1, left1, right2)
             
     def getSegments(self):

From de752f3bec57805240cdeaa8ff141db53274a5e6 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Fri, 4 Dec 2020 14:32:50 +0100
Subject: [PATCH 11/22] SegmentDict fix

---
 fairseq/modules/segmentation/segment_dict.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/fairseq/modules/segmentation/segment_dict.py b/fairseq/modules/segmentation/segment_dict.py
index 5794ecb58e..e4a2acd7d0 100644
--- a/fairseq/modules/segmentation/segment_dict.py
+++ b/fairseq/modules/segmentation/segment_dict.py
@@ -23,8 +23,9 @@ def segmentInDict(self, segment):
         line, leftIdx, rightIdx = segment
         if (line, leftIdx) not in self._dct:
             return False
-        _, rightIdxFromDict, _, _ = self._dct[(line, leftIdx)]
-        return rightIdx == rightIdxFromDict  # left already checked by key
+        leftIdxFromDict, rightIdxFromDict, _, _ = self._dct[(line, leftIdx)]
+        return leftIdx == leftIdxFromDict and rightIdx == rightIdxFromDict  
+        # (line, leftIdx) in dct can be for right-range leftIdx with different leftIdxFromDict for merged segment
         
     def removeSegment(self, segment):
         line, leftIdx, rightIdx = segment
@@ -52,9 +53,14 @@ def mergeSegments(self, segment1, segment2):
         # remove old segments; will update _size
         self.removeSegment(segment1)
         self.removeSegment(segment2)
+        #assert (line1, left1) not in self._dct
+        #assert (line1, right1) not in self._dct
+        #assert (line1, left2) not in self._dct
+        #assert (line1, right2) not in self._dct
         # add a new merged one; need to update _size by hand
         self._dct[(line1, left1)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
         self._dct[(line1, right2)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
+        print(segment1, segment2, "->", (line1, left1, right2))
         self._size += 1
         self._line_segms[line1] += 1
         return (line1, left1, right2)
@@ -73,6 +79,9 @@ def getSegmentLeft(self, segment):
         if (line, left - 1) not in self._dct:
             return None
         segmLeft, segmRight, _, _ = self._dct[(line, left - 1)]
+        print(left, right, "!", segmLeft, segmRight, left - 1)
+        #assert segmRight == left - 1
+        #assert self._dct[(line, segmLeft)][0] == segmLeft and self._dct[(line, segmLeft)][1] == segmRight
         return (line, segmLeft, segmRight)
     
     def getSegmentRight(self, segment):
@@ -82,6 +91,8 @@ def getSegmentRight(self, segment):
         if (line, right + 1) not in self._dct:
             return None
         segmLeft, segmRight, _, _ = self._dct[(line, right + 1)]
+        #assert segmLeft == right + 1
+        #assert self._dct[(line, segmRight)][0] == segmLeft and self._dct[(line, segmRight)][1] == segmRight
         return (line, segmLeft, segmRight)
     
     def getSegmentSums(self, segment):

From fdcc62887f99a79d007ea28d3be6b3a126972c64 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Fri, 4 Dec 2020 14:36:04 +0100
Subject: [PATCH 12/22] aaand removed printouts

---
 fairseq/modules/segmentation/segment_dict.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fairseq/modules/segmentation/segment_dict.py b/fairseq/modules/segmentation/segment_dict.py
index e4a2acd7d0..2da056b15d 100644
--- a/fairseq/modules/segmentation/segment_dict.py
+++ b/fairseq/modules/segmentation/segment_dict.py
@@ -60,7 +60,7 @@ def mergeSegments(self, segment1, segment2):
         # add a new merged one; need to update _size by hand
         self._dct[(line1, left1)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
         self._dct[(line1, right2)] = (left1, right2, linearSum1 + linearSum2, squaresSum1 + squaresSum2)
-        print(segment1, segment2, "->", (line1, left1, right2))
+        #print(segment1, segment2, "->", (line1, left1, right2))
         self._size += 1
         self._line_segms[line1] += 1
         return (line1, left1, right2)
@@ -79,7 +79,7 @@ def getSegmentLeft(self, segment):
         if (line, left - 1) not in self._dct:
             return None
         segmLeft, segmRight, _, _ = self._dct[(line, left - 1)]
-        print(left, right, "!", segmLeft, segmRight, left - 1)
+        #print(left, right, "!", segmLeft, segmRight, left - 1)
         #assert segmRight == left - 1
         #assert self._dct[(line, segmLeft)][0] == segmLeft and self._dct[(line, segmLeft)][1] == segmRight
         return (line, segmLeft, segmRight)

From 4cc9f3b3f24e92d1fc700f4f855ffb3ea5a4089d Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Tue, 15 Dec 2020 13:36:51 +0100
Subject: [PATCH 13/22] initial basic segmentation image logging

---
 .../models/wav2vec/wav2vec2_scribblelens.py   | 37 +++++++++++++++-
 .../hierarchical_variance_segmentation.py     | 42 ++++++++++++-------
 2 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index f782937df9..a717e287e9 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -30,6 +30,9 @@
 from fairseq.modules.transformer_sentence_encoder import init_bert_params
 from fairseq.utils import buffered_arange
 
+import random
+from PIL import Image, ImageDraw
+
 @register_model("wav2vec2_scribblelens")
 class Wav2Vec2ModelSL(BaseFairseqModel):
     @staticmethod
@@ -303,6 +306,14 @@ def add_args(parser):
             "--segm", type=str, help="use segmentation on representations; 'var' (without ') for variance-based hierarchical segm"
         )
 
+        parser.add_argument(
+            "--random-segm-log-dir", type=str, help="where to log randomly chosen segmentation images"
+        )
+
+        parser.add_argument(
+            "--random-segm-log-freq", type=float, help="how frequently (pbb) to log randomly chosen segmentation images"
+        )
+
     def __init__(self, args):
         super().__init__()
         self.args = args
@@ -405,8 +416,16 @@ def __init__(self, args):
 
         if 'segm' in args:
             self.segm = args.segm
+            if 'random_segm_log_dir' in args:
+                self.random_segm_log_dir = args.random_segm_log_dir
+                self.random_segm_log_freq = args.random_segm_log_freq
+            else:
+                self.random_segm_log_dir = None
+                self.random_segm_log_freq = None
         else:
             self.segm = None
+            self.random_segm_log_dir = None
+            self.random_segm_log_freq = None
 
     def upgrade_state_dict_named(self, state_dict, name):
         super().upgrade_state_dict_named(state_dict, name)
@@ -556,17 +575,31 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False):
         if padding_mask is not None:
             assert padding_mask.size(1) == 1
             padding_mask = padding_mask.squeeze(1)
+            scale_float = float(padding_mask.size(1)) / features.size(1) 
             scale = padding_mask.size(1) // features.size(1)
             extra = padding_mask.size(1) % features.size(1) # should be 0 since 1st CNN reduces number of features [scale] times (due to the architecture choice)
             assert extra == 0
             padding_mask = padding_mask[:, ::scale]
             assert np.all(padding_mask.shape == features.shape[:-1])
+        else:
+            scale_float = float(source.size(1)) / features.size(1)
 
         # TODO add here a function for segmentation (+command line params) and update mask
         if self.segm:
-            features, padding_mask = self.segmentation(features, padding_mask, 5)  
+            features, padding_mask, segment_borders = self.segmentation(features, padding_mask, 5)  
             # [!] minSegmsPerLine needs to be at least a few so that part with masking with at least 2 masks works correctly
 
+            if self.random_segm_log_freq is not None and random.random() < self.random_segm_log_freq:
+                img = Image.fromarray(np.array(source[0]*255., dtype=np.int32)).convert('RGB')
+                borders_here = segment_borders[0]
+                draw = ImageDraw.Draw(img)
+                for i in range(len(borders_here)):
+                    if borders_here[i] != 0:
+                        print("!", source[0].shape, i*scale_float)
+                        draw.line([(round(i*scale_float), 0), (i*round(scale_float), 31)], fill='red', width=3)
+                img.save(self.random_segm_log_dir + "/" + str(int(random.random() * 100000)) + ".png")
+            # TODO sample and plot with added segm lines; will perhaps need to return something non-tensor, how to do it in pytorch?
+
         unmasked_features = features.clone()
 
         # doing it here as needed to clone features after segmentation and clone was before post_extract_proj 
@@ -673,7 +706,7 @@ def segmentation(self, features, padding_mask, minSegmsPerLine):
         base_len_sum = non_padded / 3
         min_segm = max(features.shape[0], int(round(0.85*base_len_sum)))
         max_segm = min(non_padded, int(round(1.15*base_len_sum)))
-        return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm), minSegmsPerLine)
+        return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, base_len_sum, None, minSegmsPerLine)
 
     def quantize(self, x):
         assert self.quantizer is not None
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index 6ae37897d3..d51f245afd 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -8,8 +8,11 @@
 def variance(linearSum, squaresSum, size):
     return np.sum((squaresSum / size) - np.square(linearSum / size))  # sum of "variance vector"
 
+def mse(linearSum, squaresSum, size=None):  # size is ignored, TODO remove from signature
+    return np.sum(squaresSum - np.square(linearSum))  # sum of "mse vector"
 
 # [!] lines has to be a numpy array, np.sum() crashes if done on tensor
+# TODO actually now uses mse, as variance but not scaled with length; maybe add arg for that (?)
 def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLine=None):  # k is sum of number of segments for all lines
     
    # TODO check if tensor parts correctly taken etc. [!]
@@ -28,9 +31,9 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLin
             linSum2, sqSum2 = segmentsDict.getSegmentSums(segmRight)
             line1, left1, right1 = segm
             line2, left2, right2 = segmRight
-            oldVar1 = variance(linSum1, sqSum1, right1 - left1 + 1)
-            oldVar2 = variance(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = variance(linSum1 + linSum2, sqSum1 + sqSum2, right2 - left1 + 1)
+            oldVar1 = mse(linSum1, sqSum1, right1 - left1 + 1)
+            oldVar2 = mse(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = mse(linSum1 + linSum2, sqSum1 + sqSum2, right2 - left1 + 1)
             heappush(q, (mergedVariance - oldVar1 - oldVar2, segm, segmRight))
        
     varChanges = []
@@ -51,20 +54,20 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLin
         toRight = segmentsDict.getSegmentRight(merged)
         linSumMerged, sqSumMerged = segmentsDict.getSegmentSums(merged)
         lineMerged, leftMerged, rightMerged = merged
-        varMerged = variance(linSumMerged, sqSumMerged, rightMerged - leftMerged + 1)
+        varMerged = mse(linSumMerged, sqSumMerged, rightMerged - leftMerged + 1)
         
         if toLeft is not None:
             linSum2, sqSum2 = segmentsDict.getSegmentSums(toLeft)
             line2, left2, right2 = toLeft
-            oldVar2 = variance(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = variance(linSumMerged + linSum2, sqSumMerged + sqSum2, rightMerged - left2 + 1)
+            oldVar2 = mse(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = mse(linSumMerged + linSum2, sqSumMerged + sqSum2, rightMerged - left2 + 1)
             heappush(q, (mergedVariance - varMerged - oldVar2, toLeft, merged))
             
         if toRight is not None:
             linSum2, sqSum2 = segmentsDict.getSegmentSums(toRight)
             line2, left2, right2 = toRight
-            oldVar2 = variance(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = variance(linSumMerged + linSum2, sqSumMerged + sqSum2, right2 - leftMerged + 1)
+            oldVar2 = mse(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = mse(linSumMerged + linSum2, sqSumMerged + sqSum2, right2 - leftMerged + 1)
             heappush(q, (mergedVariance - varMerged - oldVar2, merged, toRight))
             
     return varChanges, merges, segmentsDict
@@ -142,15 +145,20 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
             paddingMaskOut[i][n:] = True
         
         segmented = np.full((input.shape[0], maxSegments, input.shape[2]), 0.)  #torch.tensor(size=(input.shape[0], maxSegments, input.shape[2])).fill_(0.)
+        # can perhaps return a tensor with 1 at the beginning of the segments, -1 at the end, 0s elsewhere
+        segmentBorders = np.zeros((input.shape[0], input.shape[1]), dtype=np.int8)
         for line, idxInLine in finalSegments.keys():
             line, begin, end = finalSegments[(line, idxInLine)]
             segmented[line][idxInLine] = np.mean(input[line][begin:(end+1)], axis=0)  #torch.mean(input[line][begin:(end+1)])
+            segmentBorders[line][begin] = 1
+            segmentBorders[line][end] = -1  # [!] can be e.g. [...0, 0, -1, -1, ...] with segment of length 1
 
         resOutput = torch.tensor(segmented, dtype=inputGPU.dtype).to(inputDevice)   #if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
         resPadMask = torch.BoolTensor(paddingMaskOut).to(padMaskInputDevice)   #if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
+        segmentBorders = torch.IntTensor(segmentBorders).to(inputDevice)
 
         #print("********************", dir(ctx))
-        ctx.save_for_backward(padMask, resPadMask)
+        #[not really needed] ctx.save_for_backward(padMask, resPadMask)
         # save_for_backward is only for tensors / variables / stuff
         ctx.finalSegments = finalSegments
         ctx.segmentNumsInLines = segmentNumsInLines
@@ -158,14 +166,15 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         ctx.mark_non_differentiable(resPadMask)  # can only pass torch variables here and only that makes sense
 
         #print("FINAL SEGMENTS: ", finalSegments, segmentNumsInLines)
-        return resOutput, resPadMask  #, finalSegments, segmentNumsInLines can only return torch variables... TODO maybe check how to fetch this info, but not sure if needed
+
+        return resOutput, resPadMask, segmentBorders  #, finalSegments, segmentNumsInLines can only return torch variables... TODO maybe check how to fetch this info, but not sure if needed
 
     @staticmethod
-    def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNumsInLines=None):
+    def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSegments=None, segmentNumsInLines=None):
 
         dxThroughDevice = dxThrough.device
 
-        paddingMask, paddingMaskOut = ctx.saved_tensors
+        #[not really needed] paddingMask, paddingMaskOut = ctx.saved_tensors
         dx = torch.empty(size=ctx.inputShape, dtype=dxThrough.dtype).fill_(0.).to('cpu')
 
         for line, idxInLine in ctx.finalSegments.keys():
@@ -192,9 +201,10 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
 
     print("-------------------------- torch ---------------------------")
     # (tensor, padMask, k, kSumRange)
-    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None)  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
+    print(borders)
     #print(finalSegments)
     #print(segmentNumsInLines)
     #loss = Variable(resOutput, requires_grad=True)
@@ -204,9 +214,10 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
     print("-------------------------- torch2 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None)  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
+    print(borders)
     #print(finalSegments)
     #print(segmentNumsInLines)
     #loss = Variable(resOutput, requires_grad=True)
@@ -216,9 +227,10 @@ def backward(ctx, dxThrough, outPadMask=None):  #, finalSegments=None, segmentNu
     print("-------------------------- torch3 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2)  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
+    print(borders)
     # [!] here will return 4 segments instead of specified 3, because of specified minSegmsPerLine
 
     resOutput.sum().backward()  # .backward() needs loss to be a number (tensor of size (1,))

From 17f91c82487e37cef3e4a5b4be6fc03e35ddabf3 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Tue, 15 Dec 2020 17:48:50 +0100
Subject: [PATCH 14/22] better segmented image logging

---
 fairseq/criterions/wav2vec_criterion.py       | 10 ++-
 .../models/wav2vec/wav2vec2_scribblelens.py   | 81 +++++++++++++++----
 fairseq_cli/train.py                          |  2 +
 uwr_related/test_cmd_scribble.sh              |  3 +-
 4 files changed, 76 insertions(+), 20 deletions(-)

diff --git a/fairseq/criterions/wav2vec_criterion.py b/fairseq/criterions/wav2vec_criterion.py
index 6ac7557dcc..3004ae16e7 100644
--- a/fairseq/criterions/wav2vec_criterion.py
+++ b/fairseq/criterions/wav2vec_criterion.py
@@ -14,11 +14,12 @@
 
 @register_criterion("wav2vec")
 class Wav2vecCriterion(FairseqCriterion):
-    def __init__(self, task, infonce=False, loss_weights=None, log_keys=None):
+    def __init__(self, task, infonce=False, loss_weights=None, log_keys=None, pass_metadata=False):
         super().__init__(task)
         self.infonce = infonce
         self.loss_weights = None if loss_weights is None else eval(loss_weights)
         self.log_keys = [] if log_keys is None else eval(log_keys)
+        self.pass_metadata = pass_metadata
 
     @staticmethod
     def add_args(parser):
@@ -30,6 +31,8 @@ def add_args(parser):
                             help='weights for additional loss terms (not first one)')
         parser.add_argument('--log-keys', type=str, default=None,
                             help='output keys to log')
+        parser.add_argument('--pass-metadata', action='store_true',
+                            help='if set, passes sample ids and epoch nr to the model (for model-specific logging of some specific-id examples per epoch etc.)')
         # fmt: on
 
     def forward(self, model, sample, reduce=True, log_pred=False):
@@ -40,7 +43,10 @@ def forward(self, model, sample, reduce=True, log_pred=False):
         2) the sample size, which is used as the denominator for the gradient
         3) logging outputs to display while training
         """
-        net_output = model(**sample["net_input"])
+        if self.pass_metadata:
+            net_output = model(**sample["net_input"], id=sample["id"], epoch=sample["epoch"])
+        else:
+            net_output = model(**sample["net_input"])
         logits = model.get_logits(net_output).float()
         target = model.get_targets(sample, net_output)
 
diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index a717e287e9..fe1a9616f5 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -307,13 +307,18 @@ def add_args(parser):
         )
 
         parser.add_argument(
-            "--random-segm-log-dir", type=str, help="where to log randomly chosen segmentation images"
+            "--segm-log-dir", type=str, help="where to log randomly chosen segmentation images; also serves as 'do log' flag"
         )
 
         parser.add_argument(
             "--random-segm-log-freq", type=float, help="how frequently (pbb) to log randomly chosen segmentation images"
         )
 
+        parser.add_argument(
+            "--segm-log-ids", type=str, help="what ids to log, format: <operator>:arg1,<operator>:arg1:arg2,... without spaces, " \
+            + "operator can be =(id) [=:id] or %(X, id) [%:1000:0], meaning exact id or ids of that modulo X"
+        )
+
     def __init__(self, args):
         super().__init__()
         self.args = args
@@ -416,15 +421,32 @@ def __init__(self, args):
 
         if 'segm' in args:
             self.segm = args.segm
-            if 'random_segm_log_dir' in args:
-                self.random_segm_log_dir = args.random_segm_log_dir
-                self.random_segm_log_freq = args.random_segm_log_freq
+            if 'segm_log_dir' in args:
+                self.segm_log_dir = args.segm_log_dir
+                self.random_segm_log_freq = args.random_segm_log_freq if 'random_segm_log_freq' in args else None
+                if 'segm_log_ids' in args:
+                    options = args.segm_log_ids.split(",")
+                    self.segm_log_ids = []
+                    for opt in options:
+                        details = opt.split(':')
+                        if details[0] == "%":
+                            # need to bind details like that, because otherwise details variable will be bound to some random thing
+                            # that was used later and was also named details; 
+                            # imo one of the nastiest things in python, 
+                            # as scope in python is "until end of function" and not "until the end of the function or sth"
+                            self.segm_log_ids.append((lambda details: (lambda x: x % int(details[1]) == int(details[2])))(details))
+                        elif details[0] == '=':
+                            self.segm_log_ids.append((lambda details: (lambda x: x == int(details[1])))(details))
+                        else:
+                            assert False
+                else:
+                    self.segm_log_ids = None
             else:
-                self.random_segm_log_dir = None
+                self.segm_log_dir = None
                 self.random_segm_log_freq = None
         else:
             self.segm = None
-            self.random_segm_log_dir = None
+            self.segm_log_dir = None
             self.random_segm_log_freq = None
 
     def upgrade_state_dict_named(self, state_dict, name):
@@ -554,7 +576,7 @@ def compute_preds(self, x, y, negatives):
 
         return logits
 
-    def forward(self, source, padding_mask=None, mask=True, features_only=False):
+    def forward(self, source, padding_mask=None, mask=True, features_only=False, id=None, epoch=None):
         # padding_mask = None  # JCh: padding_mask prob need to be True where the data is padded. mask=True => data invalid
 
         if self.feature_grad_mult > 0:
@@ -589,16 +611,10 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False):
             features, padding_mask, segment_borders = self.segmentation(features, padding_mask, 5)  
             # [!] minSegmsPerLine needs to be at least a few so that part with masking with at least 2 masks works correctly
 
-            if self.random_segm_log_freq is not None and random.random() < self.random_segm_log_freq:
-                img = Image.fromarray(np.array(source[0]*255., dtype=np.int32)).convert('RGB')
-                borders_here = segment_borders[0]
-                draw = ImageDraw.Draw(img)
-                for i in range(len(borders_here)):
-                    if borders_here[i] != 0:
-                        print("!", source[0].shape, i*scale_float)
-                        draw.line([(round(i*scale_float), 0), (i*round(scale_float), 31)], fill='red', width=3)
-                img.save(self.random_segm_log_dir + "/" + str(int(random.random() * 100000)) + ".png")
-            # TODO sample ang plot with added segm lines; will perhaps need to return something non-tensor, how to do it in pytorch?
+            if self.segm_log_dir:
+                for i in range(source.shape[0]):
+                    self.check_if_and_log_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() != 0], id=id[i] if id is not None else None, epoch=epoch)
+
 
         unmasked_features = features.clone()
 
@@ -708,6 +724,37 @@ def segmentation(self, features, padding_mask, minSegmsPerLine):
         max_segm = min(non_padded, int(round(1.15*base_len_sum)))
         return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, base_len_sum, None, minSegmsPerLine)
 
+    def log_segmented_image(self, img, borders, name=None, convert_numbers_from_01=True):
+        converted_grayscale_img = img*255. if convert_numbers_from_01 else img
+        img = Image.fromarray(np.array(converted_grayscale_img, dtype=np.int32)).convert('RGB')
+        draw = ImageDraw.Draw(img)
+        for border in borders:
+            #if borders[i] != 0:
+            #print("!", source[0].shape, i*scale_float)
+            draw.line([(border, 0), (border, 31)], fill='red', width=3)
+        save_name = name if name is not None else "<random_name_" + str(int(random.random() * 10000000)) + ">"
+        img.save(self.segm_log_dir + "/" + save_name + ".png")
+
+    def check_if_and_log_segmented_image(self, img, borders, id=None, epoch=None):
+        name = "id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
+        if self.random_segm_log_freq is not None:
+            if random.random() < self.random_segm_log_freq:
+                self.log_segmented_image(img, borders, name=name, convert_numbers_from_01=True)
+                # img = Image.fromarray(np.array(source[0]*255., dtype=np.int32)).convert('RGB')
+                # borders_here = segment_borders[0]
+                # draw = ImageDraw.Draw(img)
+                # for i in range(len(borders_here)):
+                #     if borders_here[i] != 0:
+                #         print("!", source[0].shape, i*scale_float)
+                #         draw.line([(round(i*scale_float), 0), (i*round(scale_float), 31)], fill='red', width=3)
+                # img.save(self.segm_log_dir + "/" + str(int(random.random() * 100000)) + ".png")
+        if self.segm_log_ids is not None:
+            assert id is not None  # need to use pass-metadata arg in criterion (if wav2vec, if other need to add this option)
+            for segm_log_rule in self.segm_log_ids:
+                if segm_log_rule(id):  # check if fits
+                    self.log_segmented_image(img, borders, name=name, convert_numbers_from_01=True)
+                    break
+
     def quantize(self, x):
         assert self.quantizer is not None
         x = self.feature_extractor(x)
diff --git a/fairseq_cli/train.py b/fairseq_cli/train.py
index e1af605348..3f76bb4fae 100644
--- a/fairseq_cli/train.py
+++ b/fairseq_cli/train.py
@@ -213,6 +213,8 @@ def train(
     should_stop = False
     num_updates = trainer.get_num_updates()
     for i, samples in enumerate(progress):
+        for sample in samples:
+            sample["epoch"] = epoch_itr.epoch
         with metrics.aggregate("train_inner"), torch.autograd.profiler.record_function(
             "train_step-%d" % i
         ):
diff --git a/uwr_related/test_cmd_scribble.sh b/uwr_related/test_cmd_scribble.sh
index cce3fa6d0b..d478eb1c8e 100755
--- a/uwr_related/test_cmd_scribble.sh
+++ b/uwr_related/test_cmd_scribble.sh
@@ -41,7 +41,7 @@ python train.py --distributed-world-size 1 --update-freq 2 \
   /pio/scratch/1/i283340/MGR/NewSetup/DistSup/data `#path to Scribblelens data folder` \
   --vocab-path ./fairseq/data/handwriting/tasman.alphabet.plus.space.mode5.json `#alphabet file` \
   --save-dir ../try_sl1 --num-workers 0 \
-  --task scribblelens --criterion wav2vec --arch wav2vec2_scribblelens \
+  --task scribblelens --criterion wav2vec `#--pass-metadata` --arch wav2vec2_scribblelens \
   --valid-subset test --pad-to-multiples-of 4 `#--max-sample-size 256` \
   --log-keys '["prob_perplexity","code_perplexity","temp"]' --quantize-targets --extractor-mode default \
   --conv-feature-layers '[(64, (3, 3), (1, 2), (1, 1)), (128, (5, 5), (2, 2), (2, 2)), (256, (3,3), (1, 1), (1, 1)), (256, (3,3), (1, 2), (1, 1)), (512, (3,3), (1, 1), (1, 1)), (512, (3,3), (1, 2), (1, 1)), (512, (3,2), (2, 1), (1, 0))]' \
@@ -57,5 +57,6 @@ python train.py --distributed-world-size 1 --update-freq 2 \
   --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --max-tokens 10000 --max-update 400000 \
   --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
   --labels `#can be removed for no labels` \
+  `#--segm-log-dir ../imgs3 --random-segm-log-freq 0.0001 --segm-log-ids =:715,%:1000:123` \
   `#--segm var  # optional segmentation` \ 
   --enable-padding # crashes without that, needs to make all lines same-size
\ No newline at end of file

From b45de5d956e38c735b4ecb7f2e570eac511c610e Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Wed, 16 Dec 2020 14:54:40 +0100
Subject: [PATCH 15/22] segmentation logging fixes

---
 fairseq/criterions/wav2vec_criterion.py         | 5 ++++-
 fairseq/models/wav2vec/wav2vec2_scribblelens.py | 4 ++--
 fairseq/trainer.py                              | 4 ++++
 fairseq_cli/train.py                            | 3 ++-
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/fairseq/criterions/wav2vec_criterion.py b/fairseq/criterions/wav2vec_criterion.py
index 3004ae16e7..dda994a63b 100644
--- a/fairseq/criterions/wav2vec_criterion.py
+++ b/fairseq/criterions/wav2vec_criterion.py
@@ -44,7 +44,10 @@ def forward(self, model, sample, reduce=True, log_pred=False):
         3) logging outputs to display while training
         """
         if self.pass_metadata:
-            net_output = model(**sample["net_input"], id=sample["id"], epoch=sample["epoch"])
+            # epoch is now also be passed in validation, but better be careful
+            net_output = model(**sample["net_input"], \
+                               id=sample["id"], \
+                               epoch=sample["epoch"].item() if "epoch" in sample else None)
         else:
             net_output = model(**sample["net_input"])
         logits = model.get_logits(net_output).float()
diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index fe1a9616f5..d2dae28364 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -613,7 +613,7 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
 
             if self.segm_log_dir:
                 for i in range(source.shape[0]):
-                    self.check_if_and_log_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() != 0], id=id[i] if id is not None else None, epoch=epoch)
+                    self.check_if_and_log_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() != 0], id=id[i].item() if id is not None else None, epoch=epoch)
 
 
         unmasked_features = features.clone()
@@ -726,7 +726,7 @@ def segmentation(self, features, padding_mask, minSegmsPerLine):
 
     def log_segmented_image(self, img, borders, name=None, convert_numbers_from_01=True):
         converted_grayscale_img = img*255. if convert_numbers_from_01 else img
-        img = Image.fromarray(np.array(converted_grayscale_img, dtype=np.int32)).convert('RGB')
+        img = Image.fromarray(np.array(converted_grayscale_img.cpu(), dtype=np.int32)).convert('RGB')
         draw = ImageDraw.Draw(img)
         for border in borders:
             #if borders[i] != 0:
diff --git a/fairseq/trainer.py b/fairseq/trainer.py
index 19ca213d55..685531f77b 100644
--- a/fairseq/trainer.py
+++ b/fairseq/trainer.py
@@ -935,6 +935,10 @@ def _prepare_sample(self, sample):
                         sample["target"], device=self.last_device
                     )
             else:
+                # v needed if non-tensor stuff in sample (e.g. metadata), but kept tensors for safety
+                # for key in sample:
+                #     if torch.is_tensor(key):
+                #         sample[key] = utils.move_to_cuda(sample[key])
                 sample = utils.move_to_cuda(sample)
 
         def apply_half(t):
diff --git a/fairseq_cli/train.py b/fairseq_cli/train.py
index 3f76bb4fae..cda45607fd 100644
--- a/fairseq_cli/train.py
+++ b/fairseq_cli/train.py
@@ -214,7 +214,7 @@ def train(
     num_updates = trainer.get_num_updates()
     for i, samples in enumerate(progress):
         for sample in samples:
-            sample["epoch"] = epoch_itr.epoch
+            sample["epoch"] = torch.tensor(epoch_itr.epoch, dtype=torch.int16)
         with metrics.aggregate("train_inner"), torch.autograd.profiler.record_function(
             "train_step-%d" % i
         ):
@@ -354,6 +354,7 @@ def validate(
         # don't pollute other aggregators (e.g., train meters)
         with metrics.aggregate(new_root=True) as agg:
             for sample in progress:
+                sample["epoch"] = torch.tensor(epoch_itr.epoch, dtype=torch.int16)
                 trainer.valid_step(sample)
 
         # log validation stats

From e4d4a73d7f129ec4c76784495e23fb1a979f2760 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Thu, 17 Dec 2020 22:29:22 +0100
Subject: [PATCH 16/22] added option for square error cost instead of
 variance/mse to use as priority for segment merging

---
 .../models/wav2vec/wav2vec2_scribblelens.py   | 42 +++++++++++---
 .../hierarchical_variance_segmentation.py     | 55 +++++++++++--------
 uwr_related/test_cmd_scribble.sh              |  2 +-
 3 files changed, 66 insertions(+), 33 deletions(-)

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index d2dae28364..f14b000304 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -303,7 +303,9 @@ def add_args(parser):
         )
 
         parser.add_argument(
-            "--segm", type=str, help="use segmentation on representations; 'var' (without ') for variance-based hierarchical segm"
+            "--segm", type=str, help="use segmentation on representations; 'var' (without ') for variance-based hierarchical segm; " \
+                + "also contains options, e.g. for var format is var:<segment_cost>:<batchavg_segment_nr_per_line>, where <segment_cost> is se or var, " \
+                + "and <batchavg_segment_nr_reduction_per_line> is/are float/floats of format <avg_reduction> or <min_avg_reduction>-<max_avg_reduction>"
         )
 
         parser.add_argument(
@@ -419,8 +421,26 @@ def __init__(self, args):
 
         self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim)
 
+        # part for supported segmentation options
         if 'segm' in args:
-            self.segm = args.segm
+            segm_opts = args.segm.split(":")
+            # this part needs to set stuff needed by 'segmentation' method
+            if segm_opts[0] == "var":  # TODO maybe change name to hierarchical or so
+                self.segm = "var"
+                assert len(segm_opts) == 3
+                self.var_segm_merge_priority = segm_opts[1]
+                length_reduction_options = list(map(float, segm_opts[2].split("-")))
+                if len(length_reduction_options) == 1:
+                    self.var_segm_strict_reduction = length_reduction_options[0]
+                    self.var_segm_reduction_range = None
+                elif len(length_reduction_options) == 2:
+                    self.var_segm_strict_reduction = None
+                    assert length_reduction_options[0] <= length_reduction_options[1]
+                    self.var_segm_reduction_range = tuple(length_reduction_options)
+                else:
+                    assert False
+            else:
+                assert False  # for now only that supported
             if 'segm_log_dir' in args:
                 self.segm_log_dir = args.segm_log_dir
                 self.random_segm_log_freq = args.random_segm_log_freq if 'random_segm_log_freq' in args else None
@@ -613,7 +633,9 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
 
             if self.segm_log_dir:
                 for i in range(source.shape[0]):
-                    self.check_if_and_log_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() != 0], id=id[i].item() if id is not None else None, epoch=epoch)
+                    # changed segment lines to only log begins (1 is there now for every segment, -1 if length > 1)
+                    # as can just mult by scale for begins, for ends would need to also add scale - 1
+                    self.check_if_and_log_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() == -1], id=id[i].item() if id is not None else None, epoch=epoch)
 
 
         unmasked_features = features.clone()
@@ -719,10 +741,14 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
     def segmentation(self, features, padding_mask, minSegmsPerLine):
         assert self.segm == 'var'  # for now only that supported, to be extended
         non_padded = padding_mask.numel() - padding_mask.sum().item()
-        base_len_sum = non_padded / 3
-        min_segm = max(features.shape[0], int(round(0.85*base_len_sum)))
-        max_segm = min(non_padded, int(round(1.15*base_len_sum)))
-        return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, base_len_sum, None, minSegmsPerLine)
+        if self.var_segm_strict_reduction is not None:
+            base_len_sum = int(round(non_padded / self.var_segm_strict_reduction))
+            return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, base_len_sum, None, minSegmsPerLine, self.var_segm_merge_priority)
+        else:
+            min_reduction, max_reduction = self.var_segm_reduction_range
+            min_segm = base_len_sum = int(round(non_padded / max_reduction))  #max(features.shape[0], int(round(0.85*base_len_sum)))
+            max_segm = base_len_sum = int(round(non_padded / min_reduction))  #min(non_padded, int(round(1.15*base_len_sum)))
+            return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm), minSegmsPerLine, self.var_segm_merge_priority)
 
     def log_segmented_image(self, img, borders, name=None, convert_numbers_from_01=True):
         converted_grayscale_img = img*255. if convert_numbers_from_01 else img
@@ -731,7 +757,7 @@ def log_segmented_image(self, img, borders, name=None, convert_numbers_from_01=T
         for border in borders:
             #if borders[i] != 0:
             #print("!", source[0].shape, i*scale_float)
-            draw.line([(border, 0), (border, 31)], fill='red', width=3)
+            draw.line([(border, 0), (border, 31)], fill='red', width=2)
         save_name = name if name is not None else "<random_name_" + str(int(random.random() * 10000000)) + ">"
         img.save(self.segm_log_dir + "/" + save_name + ".png")
 
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index d51f245afd..b3e93ae564 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -6,16 +6,22 @@
 from torch.autograd import Function, Variable
 
 def variance(linearSum, squaresSum, size):
-    return np.sum((squaresSum / size) - np.square(linearSum / size))  # sum of "variance vector"
+    return np.sum((squaresSum / size) - np.square(linearSum / size))  # sum of "variance mse vector"
 
-def mse(linearSum, squaresSum, size=None):  # size is ignored, TODO remove from signature
-    return np.sum(squaresSum - np.square(linearSum))  # sum of "mse vector"
+def se(linearSum, squaresSum, size):  # square error
+    return np.sum(squaresSum - np.square(linearSum) / size)  # sum of "se vector"
 
 # [!] lines has to be a numpy array, np.sum() crashes if done on tensor
-# TODO actually now uses mse, as variance but not scaled with length; maybe add arg for that (?)
-def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLine=None):  # k is sum of number of segments for all lines
+def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLine=None, mergePriority="mse"):  # k is sum of number of segments for all lines
     
-   # TODO check if tensor parts correctly taken etc. [!]
+    if mergePriority == "se":  # var not divided by size, square error
+        costFun = lambda linSum, sqSum, size: se(linSum, sqSum, size)
+    elif mergePriority == "var":  # var is mse
+        costFun = lambda linSum, sqSum, size: variance(linSum, sqSum, size)
+    else:
+        assert False
+
+    # TODO check if tensor parts correctly taken etc. [!]
     segmentsDict = SegmentDict(lines, padMask=padMask, minSegmsPerLine=minSegmsPerLine)
     
     # maybe will need to change this to arrays or so instead of dicts for efficiency
@@ -31,9 +37,9 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLin
             linSum2, sqSum2 = segmentsDict.getSegmentSums(segmRight)
             line1, left1, right1 = segm
             line2, left2, right2 = segmRight
-            oldVar1 = mse(linSum1, sqSum1, right1 - left1 + 1)
-            oldVar2 = mse(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = mse(linSum1 + linSum2, sqSum1 + sqSum2, right2 - left1 + 1)
+            oldVar1 = costFun(linSum1, sqSum1, right1 - left1 + 1)
+            oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = costFun(linSum1 + linSum2, sqSum1 + sqSum2, right2 - left1 + 1)
             heappush(q, (mergedVariance - oldVar1 - oldVar2, segm, segmRight))
        
     varChanges = []
@@ -54,20 +60,20 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLin
         toRight = segmentsDict.getSegmentRight(merged)
         linSumMerged, sqSumMerged = segmentsDict.getSegmentSums(merged)
         lineMerged, leftMerged, rightMerged = merged
-        varMerged = mse(linSumMerged, sqSumMerged, rightMerged - leftMerged + 1)
+        varMerged = costFun(linSumMerged, sqSumMerged, rightMerged - leftMerged + 1)
         
         if toLeft is not None:
             linSum2, sqSum2 = segmentsDict.getSegmentSums(toLeft)
             line2, left2, right2 = toLeft
-            oldVar2 = mse(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = mse(linSumMerged + linSum2, sqSumMerged + sqSum2, rightMerged - left2 + 1)
+            oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = costFun(linSumMerged + linSum2, sqSumMerged + sqSum2, rightMerged - left2 + 1)
             heappush(q, (mergedVariance - varMerged - oldVar2, toLeft, merged))
             
         if toRight is not None:
             linSum2, sqSum2 = segmentsDict.getSegmentSums(toRight)
             line2, left2, right2 = toRight
-            oldVar2 = mse(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = mse(linSumMerged + linSum2, sqSumMerged + sqSum2, right2 - leftMerged + 1)
+            oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
+            mergedVariance = costFun(linSumMerged + linSum2, sqSumMerged + sqSum2, right2 - leftMerged + 1)
             heappush(q, (mergedVariance - varMerged - oldVar2, merged, toRight))
             
     return varChanges, merges, segmentsDict
@@ -86,7 +92,7 @@ def flatten(x):
     # perhaps that ^ is not needed, and restore_shapes also
 
     @staticmethod
-    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPerLine=None): 
+    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPerLine=None, mergePriority="mse"): 
     # k for strict num of segments (SUM FOR ALL LINES), allowKsumRange for range OF SUM OF SEGMENTS IN ALL LINES and choosing 'best' split point
     # min and max number of merges adjusted to what is possible - e.g. because of minSegmsPerLine
 
@@ -102,7 +108,7 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
         # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
 
-        varChanges, merges, segmentsDict = hierarchicalVarianceSegmentation(input, padMask=padMask, k=k, minSegmsPerLine=minSegmsPerLine)  # won't modify input
+        varChanges, merges, segmentsDict = hierarchicalVarianceSegmentation(input, padMask=padMask, k=k, minSegmsPerLine=minSegmsPerLine, mergePriority=mergePriority)  # won't modify input
         #print("MERGES0: ", merges)
         if allowKsumRange:  # full merge done above, k=None, so each line now has minSegmsPerLine, but can also just get it from SegmDict - cleaner
             begin, end = allowKsumRange
@@ -150,8 +156,9 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         for line, idxInLine in finalSegments.keys():
             line, begin, end = finalSegments[(line, idxInLine)]
             segmented[line][idxInLine] = np.mean(input[line][begin:(end+1)], axis=0)  #torch.mean(input[line][begin:(end+1)])
-            segmentBorders[line][begin] = 1
-            segmentBorders[line][end] = -1  # [!] can be e.g. [...0, 0, -1, -1, ...] with segment of length 1
+            segmentBorders[line][end] = -1  
+            segmentBorders[line][begin] = 1  # [!] can be e.g. [...0, 0, 1, 1, ...] with segment of length 1 
+            # - marking begins when length 1 as * scaling doesn't need + (scale-1) there if logging only begins
 
         resOutput = torch.tensor(segmented, dtype=inputGPU.dtype).to(inputDevice)   #if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
         resPadMask = torch.BoolTensor(paddingMaskOut).to(padMaskInputDevice)   #if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
@@ -183,7 +190,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
 
         dx = dx.to(dxThroughDevice)
 
-        return dx, None, None, None, None
+        return dx, None, None, None, None, None
 
 
 if __name__ == '__main__':
@@ -196,12 +203,12 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
 
     tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]]], dtype=torch.float64).requires_grad_(True)
     print(tensor[0][1])
-    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=4, minSegmsPerLine=None))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
-    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=2, minSegmsPerLine=None))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
+    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=4, minSegmsPerLine=None, mergePriority="se"))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
+    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=2, minSegmsPerLine=None, mergePriority="var"))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
 
     print("-------------------------- torch ---------------------------")
     # (tensor, padMask, k, kSumRange)
-    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None)  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None, "var")  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
@@ -214,7 +221,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
     print("-------------------------- torch2 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None)  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None, "se")  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
@@ -227,7 +234,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
     print("-------------------------- torch3 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2)  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2, "se")  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
diff --git a/uwr_related/test_cmd_scribble.sh b/uwr_related/test_cmd_scribble.sh
index d478eb1c8e..27701f8bfd 100755
--- a/uwr_related/test_cmd_scribble.sh
+++ b/uwr_related/test_cmd_scribble.sh
@@ -58,5 +58,5 @@ python train.py --distributed-world-size 1 --update-freq 2 \
   --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
   --labels `#can be removed for no labels` \
   `#--segm-log-dir ../imgs3 --random-segm-log-freq 0.0001 --segm-log-ids =:715,%:1000:123` \
-  `#--segm var  # optional segmentation` \ 
+  `#--segm var:se:2.5-3.5  # optional segmentation` \ 
   --enable-padding # crashes without that, needs to make all lines same-size
\ No newline at end of file

From 47e30b14121673a06a70a933f177ea0cba41135b Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Tue, 22 Dec 2020 19:27:22 +0100
Subject: [PATCH 17/22] option for logging representations (with input images
 also)

---
 .../models/wav2vec/wav2vec2_scribblelens.py   | 129 +++++++++++-------
 uwr_related/test_cmd_scribble.sh              |   2 +-
 2 files changed, 80 insertions(+), 51 deletions(-)

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index f14b000304..b8eaae7999 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -309,18 +309,26 @@ def add_args(parser):
         )
 
         parser.add_argument(
-            "--segm-log-dir", type=str, help="where to log randomly chosen segmentation images; also serves as 'do log' flag"
+            "--log-ids", type=str, help="for what ids to log, format: <operator>:arg1,<operator>:arg1:arg2,... without spaces, " \
+            + "operator can be =(id) [=:id] or %(X, id) [%:1000:0], meaning exact id or ids of that modulo X"
         )
 
         parser.add_argument(
-            "--random-segm-log-freq", type=float, help="how frequently (pbb) to log randomly chosen segmentation images"
+            "--random-log-freq", type=float, help="how frequently (pbb) to log for randomly chosen ids"
         )
 
+        # to have some data logged, need to specify for which IDs (--log-ids and/or --random-log-freq) and what to log (flags below)
+
         parser.add_argument(
-            "--segm-log-ids", type=str, help="what ids to log, format: <operator>:arg1,<operator>:arg1:arg2,... without spaces, " \
-            + "operator can be =(id) [=:id] or %(X, id) [%:1000:0], meaning exact id or ids of that modulo X"
+            "--segm-log-dir", type=str, help="where to log chosen segmentation images; also serves as 'do log' flag"
         )
 
+        parser.add_argument(
+            "--repr-log-dir", type=str, help="where to log chosen representation data and raw input images; also serves as 'do log' flag"
+        )
+
+        
+
     def __init__(self, args):
         super().__init__()
         self.args = args
@@ -421,6 +429,26 @@ def __init__(self, args):
 
         self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim)
 
+        # options for choosing ids to log for
+        self.random_log_freq = args.random_log_freq if 'random_log_freq' in args else None
+        if 'log_ids' in args:
+            options = args.log_ids.split(",")
+            self.log_ids = []
+            for opt in options:
+                details = opt.split(':')
+                if details[0] == "%":
+                    # bind the current `details` through an immediately-invoked outer lambda: Python closures
+                    # capture variables, not values (late binding), so without this every rule would see
+                    # whatever `details` holds after the loop's last iteration; a name assigned in a loop
+                    # is scoped to the whole enclosing function, not to a single loop iteration
+                    self.log_ids.append((lambda details: (lambda x: x % int(details[1]) == int(details[2])))(details))
+                elif details[0] == '=':
+                    self.log_ids.append((lambda details: (lambda x: x == int(details[1])))(details))
+                else:
+                    assert False
+        else:
+            self.log_ids = None
+
         # part for supported segmentation options
         if 'segm' in args:
             segm_opts = args.segm.split(":")
@@ -443,31 +471,18 @@ def __init__(self, args):
                 assert False  # for now only that supported
             if 'segm_log_dir' in args:
                 self.segm_log_dir = args.segm_log_dir
-                self.random_segm_log_freq = args.random_segm_log_freq if 'random_segm_log_freq' in args else None
-                if 'segm_log_ids' in args:
-                    options = args.segm_log_ids.split(",")
-                    self.segm_log_ids = []
-                    for opt in options:
-                        details = opt.split(':')
-                        if details[0] == "%":
-                            # need to bind details like that, because otherwise details variable will be bound to some random thing
-                            # that was used later and was also named details; 
-                            # imo one of the nastiest things in python, 
-                            # as scope in python is "until end of function" and not "until the end of the function or sth"
-                            self.segm_log_ids.append((lambda details: (lambda x: x % int(details[1]) == int(details[2])))(details))
-                        elif details[0] == '=':
-                            self.segm_log_ids.append((lambda details: (lambda x: x == int(details[1])))(details))
-                        else:
-                            assert False
-                else:
-                    self.segm_log_ids = None
             else:
                 self.segm_log_dir = None
-                self.random_segm_log_freq = None
         else:
             self.segm = None
             self.segm_log_dir = None
-            self.random_segm_log_freq = None
+
+        if 'repr_log_dir' in args:
+            self.repr_log_dir = args.repr_log_dir
+        else:
+            self.repr_log_dir = None
+
+        self.need_logging = self.segm_log_dir is not None or self.repr_log_dir is not None
 
     def upgrade_state_dict_named(self, state_dict, name):
         super().upgrade_state_dict_named(state_dict, name)
@@ -626,17 +641,22 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
         else:
             scale_float = float(source.size(1)) / features.size(1)
 
-        # TODO add here a function for segmentation (+command line params) and update mask
         if self.segm:
             features, padding_mask, segment_borders = self.segmentation(features, padding_mask, 5)  
             # [!] minSegmsPerLine needs to be at least a few so that part with masking with at least 2 masks works correctly
 
-            if self.segm_log_dir:
-                for i in range(source.shape[0]):
-                    # changed segment lines to only log begins (1 is there now for every segment, -1 if length > 1)
-                    # as can just mult by scale for begins, for ends would need to also add scale - 1
-                    self.check_if_and_log_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() == -1], id=id[i].item() if id is not None else None, epoch=epoch)
-
+        if self.need_logging:
+            for i in range(source.shape[0]):
+                if self.check_if_log_for_id(id=id[i].item()):
+                    if self.segm_log_dir:
+                        assert self.segm
+                        # changed segment lines to only log begins (1 is there now for every segment, -1 if length > 1)
+                        # as can just mult by scale for begins, for ends would need to also add scale - 1
+                        self.log_named_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() == -1], id=id[i].item() if id is not None else None, epoch=epoch)
+                    if self.repr_log_dir:
+                        self.log_repr(source[i], features[i], id=id[i].item() if id is not None else None, epoch=epoch)
+                        # [!] logging here, before projection as this is what representation segmentation uses 
+                        # - TODO would otherwise need to change unmasked_features = features.clone() to be after projection instead of before
 
         unmasked_features = features.clone()
 
@@ -752,7 +772,9 @@ def segmentation(self, features, padding_mask, minSegmsPerLine):
 
     def log_segmented_image(self, img, borders, name=None, convert_numbers_from_01=True):
         converted_grayscale_img = img*255. if convert_numbers_from_01 else img
-        img = Image.fromarray(np.array(converted_grayscale_img.cpu(), dtype=np.int32)).convert('RGB')
+        if torch.is_tensor(converted_grayscale_img):
+            converted_grayscale_img = converted_grayscale_img.detach().cpu()
+        img = Image.fromarray(np.array(converted_grayscale_img, dtype=np.int32)).convert('RGB')
         draw = ImageDraw.Draw(img)
         for border in borders:
             #if borders[i] != 0:
@@ -761,25 +783,32 @@ def log_segmented_image(self, img, borders, name=None, convert_numbers_from_01=T
         save_name = name if name is not None else "<random_name_" + str(int(random.random() * 10000000)) + ">"
         img.save(self.segm_log_dir + "/" + save_name + ".png")
 
-    def check_if_and_log_segmented_image(self, img, borders, id=None, epoch=None):
-        name = "id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
-        if self.random_segm_log_freq is not None:
-            if random.random() < self.random_segm_log_freq:
-                self.log_segmented_image(img, borders, name=name, convert_numbers_from_01=True)
-                # img = Image.fromarray(np.array(source[0]*255., dtype=np.int32)).convert('RGB')
-                # borders_here = segment_borders[0]
-                # draw = ImageDraw.Draw(img)
-                # for i in range(len(borders_here)):
-                #     if borders_here[i] != 0:
-                #         print("!", source[0].shape, i*scale_float)
-                #         draw.line([(round(i*scale_float), 0), (i*round(scale_float), 31)], fill='red', width=3)
-                # img.save(self.segm_log_dir + "/" + str(int(random.random() * 100000)) + ".png")
-        if self.segm_log_ids is not None:
+    def log_named_segmented_image(self, img, borders, id=None, epoch=None):
+        name = "segm_id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
+        self.log_segmented_image(img, borders, name=name, convert_numbers_from_01=True)
+
+    def log_repr(self, img, features, id=None, epoch=None):
+        if torch.is_tensor(img):
+            img = img.detach().cpu()
+        if torch.is_tensor(features):
+            features = features.detach().cpu()
+        img_np = np.array(img)
+        features_np = np.array(features)
+        img_name = "input_id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
+        features_name = "features_id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
+        np.save(self.repr_log_dir + "/" + img_name, img_np)
+        np.save(self.repr_log_dir + "/" + features_name, features_np)
+
+    def check_if_log_for_id(self, id=None):
+        if self.random_log_freq is not None:
+            if random.random() < self.random_log_freq:
+                return True
+        if self.log_ids is not None:
             assert id is not None  # need to use pass-metadata arg in criterion (if wav2vec, if other need to add this option)
-            for segm_log_rule in self.segm_log_ids:
-                if segm_log_rule(id):  # check if fits
-                    self.log_segmented_image(img, borders, name=name, convert_numbers_from_01=True)
-                    break
+            for log_rule in self.log_ids:
+                if log_rule(id):  # check if fits
+                    return True
+        return False
 
     def quantize(self, x):
         assert self.quantizer is not None
diff --git a/uwr_related/test_cmd_scribble.sh b/uwr_related/test_cmd_scribble.sh
index 27701f8bfd..a3db5b0f45 100755
--- a/uwr_related/test_cmd_scribble.sh
+++ b/uwr_related/test_cmd_scribble.sh
@@ -57,6 +57,6 @@ python train.py --distributed-world-size 1 --update-freq 2 \
   --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --max-tokens 10000 --max-update 400000 \
   --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
   --labels `#can be removed for no labels` \
-  `#--segm-log-dir ../imgs3 --random-segm-log-freq 0.0001 --segm-log-ids =:715,%:1000:123` \
+  `#--segm-log-dir ../imgs3 --repr-log-dir ../repr3 --random-log-freq 0.0001 --log-ids =:715,%:1000:123` \
   `#--segm var:se:2.5-3.5  # optional segmentation` \ 
   --enable-padding # crashes without that, needs to make all lines same-size
\ No newline at end of file

From e895131013a4a1c6ecd686651d41a4217749dae0 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Mon, 28 Dec 2020 22:44:03 +0100
Subject: [PATCH 18/22] initial representation similarity plotting with some
 fixes

---
 .../models/wav2vec/wav2vec2_scribblelens.py   |  40 +++++--
 uwr_related/experiments/pp/gen_sim_imgs.py    | 106 ++++++++++++++++++
 uwr_related/test_cmd_scribble.sh              |   2 +-
 3 files changed, 135 insertions(+), 13 deletions(-)
 create mode 100644 uwr_related/experiments/pp/gen_sim_imgs.py

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index b8eaae7999..5f0bf5e53c 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -324,7 +324,7 @@ def add_args(parser):
         )
 
         parser.add_argument(
-            "--repr-log-dir", type=str, help="where to log chosen representation data and raw input images; also serves as 'do log' flag"
+            "--repr-data-log-dir", type=str, help="where to log chosen array data (representation data, raw input images, and segment borders if segmentation); also serves as 'do log' flag"
         )
 
         
@@ -477,12 +477,12 @@ def __init__(self, args):
             self.segm = None
             self.segm_log_dir = None
 
-        if 'repr_log_dir' in args:
-            self.repr_log_dir = args.repr_log_dir
+        if 'repr_data_log_dir' in args:
+            self.repr_data_log_dir = args.repr_data_log_dir
         else:
-            self.repr_log_dir = None
+            self.repr_data_log_dir = None
 
-        self.need_logging = self.segm_log_dir is not None or self.repr_log_dir is not None
+        self.need_logging = self.segm_log_dir is not None or self.repr_data_log_dir is not None
 
     def upgrade_state_dict_named(self, state_dict, name):
         super().upgrade_state_dict_named(state_dict, name)
@@ -641,6 +641,15 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
         else:
             scale_float = float(source.size(1)) / features.size(1)
 
+        # TODO maybe move logging segments images to a script also and here just log borders array? or option both here and as a script potentially
+        if self.need_logging:
+            for i in range(source.shape[0]):
+                if self.check_if_log_for_id(id=id[i].item()):
+                    if self.repr_data_log_dir:
+                        self.log_repr_nonsegmentation_data(source[i], features[i], id=id[i].item() if id is not None else None, epoch=epoch)
+                        # [!] logging here, before projection as this is what representation segmentation uses 
+                        # - TODO would otherwise need to change unmasked_features = features.clone() to be after projection instead of before
+
         if self.segm:
             features, padding_mask, segment_borders = self.segmentation(features, padding_mask, 5)  
             # [!] minSegmsPerLine needs to be at least a few so that part with masking with at least 2 masks works correctly
@@ -652,9 +661,9 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
                         assert self.segm
                         # changed segment lines to only log begins (1 is there now for every segment, -1 if length > 1)
                         # as can just mult by scale for begins, for ends would need to also add scale - 1
-                        self.log_named_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() == -1], id=id[i].item() if id is not None else None, epoch=epoch)
-                    if self.repr_log_dir:
-                        self.log_repr(source[i], features[i], id=id[i].item() if id is not None else None, epoch=epoch)
+                        self.log_named_segmented_image(source[i], [int(round(j*scale_float)) for j, k in enumerate(segment_borders[i]) if k.item() == 1], id=id[i].item() if id is not None else None, epoch=epoch)
+                    if self.repr_data_log_dir and self.segm:
+                        self.log_repr_segmentation_data(segment_borders[i], id=id[i].item() if id is not None else None, epoch=epoch)
                         # [!] logging here, before projection as this is what representation segmentation uses 
                         # - TODO would otherwise need to change unmasked_features = features.clone() to be after projection instead of before
 
@@ -787,7 +796,7 @@ def log_named_segmented_image(self, img, borders, id=None, epoch=None):
         name = "segm_id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
         self.log_segmented_image(img, borders, name=name, convert_numbers_from_01=True)
 
-    def log_repr(self, img, features, id=None, epoch=None):
+    def log_repr_nonsegmentation_data(self, img, features, id=None, epoch=None):
         if torch.is_tensor(img):
             img = img.detach().cpu()
         if torch.is_tensor(features):
@@ -796,9 +805,16 @@ def log_repr(self, img, features, id=None, epoch=None):
         features_np = np.array(features)
         img_name = "input_id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
         features_name = "features_id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
-        np.save(self.repr_log_dir + "/" + img_name, img_np)
-        np.save(self.repr_log_dir + "/" + features_name, features_np)
-
+        np.save(self.repr_data_log_dir + "/" + img_name, img_np)
+        np.save(self.repr_data_log_dir + "/" + features_name, features_np)
+
+    def log_repr_segmentation_data(self, borders, id=None, epoch=None):
+        if torch.is_tensor(borders):
+            borders = borders.detach().cpu()
+        borders_np = np.array(borders)
+        borders_name = "segmentborders_id_" + str(id) + "_epoch_" + str(epoch) if id is not None else None  # will have names with id, possibly overwriting each epoch, otherwise random ids
+        np.save(self.repr_data_log_dir + "/" + borders_name, borders_np)
+        
     def check_if_log_for_id(self, id=None):
         if self.random_log_freq is not None:
             if random.random() < self.random_log_freq:
diff --git a/uwr_related/experiments/pp/gen_sim_imgs.py b/uwr_related/experiments/pp/gen_sim_imgs.py
new file mode 100644
index 0000000000..a413e1e71c
--- /dev/null
+++ b/uwr_related/experiments/pp/gen_sim_imgs.py
@@ -0,0 +1,106 @@
+
+import sys
+import os
+import numpy as np
+from PIL import Image, ImageDraw
+#import matplotlib.pyplot as plt
+
+assert len(sys.argv) == 2 or len(sys.argv) == 3  # pass directory with data as an arg and also (default 50) for how many pixels there should be a helper grid line
+#print(sys.argv[0], sys.argv[1])
+
+# TODO a mode with grid on segment borders
+gridHz = int(sys.argv[2]) if len(sys.argv) == 3 else 50
+
+# this should map for whole array, can e.g. use numpy etc. then
+def mapSimFromDist(numArr, minVal=0., maxVal=1.):
+    # this one only makes sense with >= 0 values
+    # maps to one colour
+    # TODO logarithmic option??
+    #minRev = 1./maxVal
+    #maxRev = 1./maxVal
+    #mapped = (1./num - minRev) / maxRev
+    if maxVal == 0:
+        maxVal = 1
+    return (float(maxVal) - (numArr - minVal)) / (maxVal - minVal)
+
+dct = {}
+for f in os.scandir(sys.argv[1]):
+    els = f.name.split("_")
+    t = els[0]
+    rest = "_".join(els[1:])
+    dct[(t,rest)] = f.name
+
+print(dct)
+
+print("===================", mapSimFromDist(0.), mapSimFromDist(1.))
+
+for t, rest in dct:
+    if t != "input":
+        continue
+    if ("features", rest) not in dct:
+        continue
+    inputFile = sys.argv[1] + "/" + t + "_" + rest
+    featuresFile = sys.argv[1] + "/" + "features" + "_" + rest
+    print("processing", inputFile, featuresFile)
+    inputArr = np.load(inputFile)
+    featuresArr = np.load(featuresFile).T
+
+    # calculating array of distances between the representations; TODO add some params for different options, also similarity without distance in between
+    inputSq = np.square(inputArr).sum(axis=0)
+    featuresSq = np.square(featuresArr).sum(axis=0)
+    #print(inputArr.shape, inputSq.shape, featuresArr.shape, featuresArr.T.shape, featuresSq.shape, featuresSq[np.newaxis,:].T.shape)
+    distArr = featuresSq + featuresSq[np.newaxis,:].T - 2. * np.matmul(featuresArr.T, featuresArr)
+    #print(distArr.shape)
+
+    bigArr = np.zeros((inputArr.shape[0] + 3 + inputArr.shape[1], inputArr.shape[0] + 3 + inputArr.shape[1], 3))
+    #print(bigArr.shape, inputArr.shape)
+
+    # adding input images on the top and on the left and blue lines to an image
+    bigArr[:inputArr.shape[0], (inputArr.shape[0]+3):(inputArr.shape[0]+3+inputArr.shape[1]), :] = inputArr[:,:,np.newaxis]
+    bigArr[(inputArr.shape[0]+3):(inputArr.shape[0]+3+inputArr.shape[1]), :inputArr.shape[0], :] = np.flip(inputArr, axis=0).T[:,:,np.newaxis]
+    bigArr[inputArr.shape[0]:(inputArr.shape[0] + 3), (inputArr.shape[0]):(inputArr.shape[0]+3+inputArr.shape[1]), 2] = 1.
+    bigArr[(inputArr.shape[0]):(inputArr.shape[0]+3+inputArr.shape[1]), inputArr.shape[0]:(inputArr.shape[0]  +3), 2] = 1.
+    a1 = ((inputArr.shape[0], inputArr.shape[0] + 3), (inputArr.shape[0], inputArr.shape[0]+3+inputArr.shape[1]))
+    a2 = ((inputArr.shape[0], inputArr.shape[0]+3+inputArr.shape[1]), (inputArr.shape[0], inputArr.shape[0]  +3))
+    #print("!!!", distArr[:2,:2])
+    #sim = plt.imshow(distArr, cmap='viridis', )
+    scaleFloat = float(inputArr.shape[1]) / float(distArr.shape[0])
+    #print("---->", scaleFloat, inputArr.shape[1], distArr.shape[0])
+
+    # creating similarity array from distance array; TODO as mentioned where creating distArr, add some params for different options, also without dist, like e.g. cosine sim
+    minVal = distArr.min()
+    maxVal = distArr.max()
+    simArr = mapSimFromDist(distArr, minVal, maxVal)
+
+    # converting stuff to 0-255 (but not ints where not needed yet) and increasing size 2x
+    bigArr = bigArr * 255.  # scaling here, as similarity scaled separately below
+    #print("!!!", bigArr.shape, np.ones((2,2)).shape)
+    bigArr = bigArr.repeat(2, axis=0).repeat(2, axis=1)
+    #print("!!!", bigArr.shape)
+    bigArr[2*(inputArr.shape[0] + 3):, 2*(inputArr.shape[0] + 3):, 0] = np.asarray(Image.fromarray(np.array(simArr*255., dtype=np.int8)).resize((inputArr.shape[1]*2, inputArr.shape[1]*2), resample=Image.NEAREST))
+
+    # from here stuff is 2x bigger (as grid lines were otherwise too big)
+
+    # choosing helper grid positions to plot
+    if ("segmentborders", rest) in dct and len(sys.argv) == 2:  # ONLY plot as segments if no grid density specified
+        bordersFile = featuresFile = sys.argv[1] + "/" + "segmentborders" + "_" + rest
+        bordersArr = np.load(bordersFile)
+        #print("--------", bordersArr)
+        gridBorders = [ 2*(inputArr.shape[0] + 3) + 2*int(round(j*scaleFloat)) for j, k in enumerate(bordersArr) if k == 1]
+        #print("[][][][]", gridBorders)
+    else:
+        gridBorders = range(2*(inputArr.shape[0]+3), 2*(inputArr.shape[0]+3+inputArr.shape[1]), gridHz*2)
+
+    # plotting helper grid
+    for i in gridBorders:
+        # grid helper
+        bigArr[i, 2*(inputArr.shape[0]+3):, :] = 255
+        bigArr[2*(inputArr.shape[0]+3):, i, :] = 255
+    # for i in range(inputArr.shape[1]):
+    #     for j in range(inputArr.shape[1]):
+    #         bigArr[inputArr.shape[0] + 3 + i, inputArr.shape[0] + 3 + j][0] = mapSimFromDist(distArr[int(i / scaleFloat)][int(j / scaleFloat)], minVal, maxVal)
+    
+    # saving image from array
+    img = Image.fromarray(np.array(bigArr, dtype=np.int8), 'RGB')  #.resize((bigArr.shape[0]*2, bigArr.shape[1]*2))  # PIL needs EXPLICIT int8, won't understand that int32 of values <256 is int8
+    img.save(sys.argv[1] + "/" + "visualization" + "_" + rest.split(".")[0] + ".png")
+    #img.show()
diff --git a/uwr_related/test_cmd_scribble.sh b/uwr_related/test_cmd_scribble.sh
index a3db5b0f45..70a3ea3bd3 100755
--- a/uwr_related/test_cmd_scribble.sh
+++ b/uwr_related/test_cmd_scribble.sh
@@ -57,6 +57,6 @@ python train.py --distributed-world-size 1 --update-freq 2 \
   --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --max-tokens 10000 --max-update 400000 \
   --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
   --labels `#can be removed for no labels` \
-  `#--segm-log-dir ../imgs3 --repr-log-dir ../repr3 --random-log-freq 0.0001 --log-ids =:715,%:1000:123` \
+  `#--segm-log-dir ../imgs3 --repr-data-log-dir ../repr3 --random-log-freq 0.0001 --log-ids =:715,%:1000:123` \
   `#--segm var:se:2.5-3.5  # optional segmentation` \ 
   --enable-padding # crashes without that, needs to make all lines same-size
\ No newline at end of file

From 544d3356bb57158c5ac3005d185cd2281f41c412 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Sat, 2 Jan 2021 22:00:23 +0100
Subject: [PATCH 19/22] cosine distance option and preparation for different
 segment shortening options

---
 .../models/wav2vec/wav2vec2_scribblelens.py   | 15 +++--
 .../hierarchical_variance_segmentation.py     | 57 ++++++++++++++-----
 2 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index 5f0bf5e53c..1eb4b09c15 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -304,9 +304,13 @@ def add_args(parser):
 
         parser.add_argument(
             "--segm", type=str, help="use segmentation on representations; 'var' (without ') for variance-based hierarchical segm; " \
-                + "also contains options, e.g. for var format is var:<segment_cost>:<batchavg_segment_nr_per_line>, where <segment_cost> is se or var, " \
-                + "and <batchavg_segment_nr_reduction_per_line> is/are float/floats of format <avg_reduction> or <min_avg_reduction>-<max_avg_reduction>"
-        )
+                + "also contains options, e.g. for var format is var:<segment_cost>:<shortening_mode>:<batchavg_segment_nr_per_line>, where:\n" \
+                + " i) <segment_cost> is se (squared error), var (variance, se div by length), cos (cosine similarity mapped linearly to distance metric and scaled with segment length) \n" \
+                + " ii) <shortening_mode> is one of: shorten (averages in segments and replaces each with length 1), orig_len (replace with mean in segments, but keep length), " \
+                + "orig_len_guess_orig (as in orig_len, but use original not-averaged representations as masked ones to guess correct one from)\n" \
+                + " iii) <batchavg_segment_nr_reduction_per_line> is/are float/floats of format <avg_reduction> or <min_avg_reduction>-<max_avg_reduction>\n"
+        )  # TODO add option for reconstruction/rounding loss; maybe also think about an option with ~constant length reduction (but at least one piece each segment) so that 
+           #      the long segments are not complete random averaged stuff
 
         parser.add_argument(
             "--log-ids", type=str, help="for what ids to log, format: <operator>:arg1,<operator>:arg1:arg2,... without spaces, " \
@@ -453,10 +457,11 @@ def __init__(self, args):
         if 'segm' in args:
             segm_opts = args.segm.split(":")
             # this part needs to set stuff needed by 'segmentation' method
-            if segm_opts[0] == "var":  # TODO maybe change name to hierarchical or so
+            if segm_opts[0] == "var":  # TODO change name to hierarchical or so and add cosinus dist option
                 self.segm = "var"
-                assert len(segm_opts) == 3
+                assert len(segm_opts) == 4
                 self.var_segm_merge_priority = segm_opts[1]
+                self.var_segm_length_policy = segm_opts[2]
                 length_reduction_options = list(map(float, segm_opts[2].split("-")))
                 if len(length_reduction_options) == 1:
                     self.var_segm_strict_reduction = length_reduction_options[0]
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
index b3e93ae564..466788ff5e 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
@@ -11,13 +11,32 @@ def variance(linearSum, squaresSum, size):
 def se(linearSum, squaresSum, size):  # square error
     return np.sum(squaresSum - np.square(linearSum) / size)  # sum of "se vector"
 
+def varianceDiff(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2):  # merge cost: variance of the merged segment minus the variances of both parts
+    return variance(linearSum1 + linearSum2, squaresSum1 + squaresSum2, size1 + size2) - variance(linearSum1, squaresSum1, size1) - variance(linearSum2, squaresSum2, size2)
+
+def seDiff(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2):  # merge cost: square error of the merged segment minus the square errors of both parts
+    return se(linearSum1 + linearSum2, squaresSum1 + squaresSum2, size1 + size2) - se(linearSum1, squaresSum1, size1) - se(linearSum2, squaresSum2, size2)
+
+def cosDist(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2):  # cosine distance of the two segment sums, scaled by merged size
+    # squaresSum1 / squaresSum2 are unused here; they are accepted so that all merge cost functions share one signature
+    normProduct = np.sqrt(np.dot(linearSum1, linearSum1)) * np.sqrt(np.dot(linearSum2, linearSum2))
+    # a zero-norm sum has no direction: treat it as orthogonal (similarity 0) instead of pushing 0/0 = nan as a heap priority
+    unscaledSim = np.dot(linearSum1, linearSum2) / normProduct if normProduct > 0. else 0.
+    unscaledAsDist = 1. - unscaledSim  # standard "cosine distance": maps similarity [-1, 1] to distance [0, 2], reversing the order
+    return unscaledAsDist * (size1 + size2)
+    # scaling by total size so that big, almost-random averaged segments don't look similar (randomnoise1 ~= randomnoise2);
+    # changing from similarity to distance matters here; this approximates summing per-element cosine distances (not exact),
+    # but the other one would be more expensive to compute
+
 # [!] lines has to be a numpy array, np.sum() crashes if done on tensor
 def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLine=None, mergePriority="mse"):  # k is sum of number of segments for all lines
     
     if mergePriority == "se":  # var not divided by size, square error
-        costFun = lambda linSum, sqSum, size: se(linSum, sqSum, size)
+        costFun = seDiff  #lambda linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2: seDiff(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2)
     elif mergePriority == "var":  # var is mse
-        costFun = lambda linSum, sqSum, size: variance(linSum, sqSum, size)
+        costFun = varianceDiff  #lambda linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2: varianceDiff(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2)
+    elif mergePriority == "cos":
+        costFun = cosDist  #lambda linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2: cos(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2)
     else:
         assert False
 
@@ -37,10 +56,13 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLin
             linSum2, sqSum2 = segmentsDict.getSegmentSums(segmRight)
             line1, left1, right1 = segm
             line2, left2, right2 = segmRight
-            oldVar1 = costFun(linSum1, sqSum1, right1 - left1 + 1)
-            oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = costFun(linSum1 + linSum2, sqSum1 + sqSum2, right2 - left1 + 1)
-            heappush(q, (mergedVariance - oldVar1 - oldVar2, segm, segmRight))
+            #oldVar1 = costFun(linSum1, sqSum1, right1 - left1 + 1)
+            #oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
+            #mergedVariance = costFun(linSum1 + linSum2, sqSum1 + sqSum2, right2 - left1 + 1)
+            size1 = right1 - left1 + 1
+            size2 = right2 - left2 + 1
+            costDiff = costFun(linSum1, sqSum1, size1, linSum2, sqSum2, size2)
+            heappush(q, (costDiff, segm, segmRight))
        
     varChanges = []
     merges = []
@@ -60,21 +82,26 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLin
         toRight = segmentsDict.getSegmentRight(merged)
         linSumMerged, sqSumMerged = segmentsDict.getSegmentSums(merged)
         lineMerged, leftMerged, rightMerged = merged
-        varMerged = costFun(linSumMerged, sqSumMerged, rightMerged - leftMerged + 1)
+        sizeMerged = rightMerged - leftMerged + 1
+        #varMerged = costFun(linSumMerged, sqSumMerged, rightMerged - leftMerged + 1)
         
         if toLeft is not None:
             linSum2, sqSum2 = segmentsDict.getSegmentSums(toLeft)
             line2, left2, right2 = toLeft
-            oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = costFun(linSumMerged + linSum2, sqSumMerged + sqSum2, rightMerged - left2 + 1)
-            heappush(q, (mergedVariance - varMerged - oldVar2, toLeft, merged))
+            size2 = right2 - left2 + 1
+            #oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
+            #mergedVariance = costFun(linSumMerged + linSum2, sqSumMerged + sqSum2, rightMerged - left2 + 1)
+            costDiff = costFun(linSumMerged, sqSumMerged, sizeMerged, linSum2, sqSum2, size2)
+            heappush(q, (costDiff, toLeft, merged))
             
         if toRight is not None:
             linSum2, sqSum2 = segmentsDict.getSegmentSums(toRight)
             line2, left2, right2 = toRight
-            oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
-            mergedVariance = costFun(linSumMerged + linSum2, sqSumMerged + sqSum2, right2 - leftMerged + 1)
-            heappush(q, (mergedVariance - varMerged - oldVar2, merged, toRight))
+            size2 = right2 - left2 + 1
+            #oldVar2 = costFun(linSum2, sqSum2, right2 - left2 + 1)
+            #mergedVariance = costFun(linSumMerged + linSum2, sqSumMerged + sqSum2, right2 - leftMerged + 1)
+            costDiff = costFun(linSumMerged, sqSumMerged, sizeMerged, linSum2, sqSum2, size2)
+            heappush(q, (costDiff, merged, toRight))
             
     return varChanges, merges, segmentsDict
 
@@ -103,6 +130,8 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         inputDevice = inputGPU.device
         padMaskInputDevice = padMask.device if padMask is not None else False
 
+        # TODO TODO add shortening policy stuff !
+
         # tensor to CPU  (don't really need copy, will just need to put tensors in segmentsDict)
         input = inputGPU.detach().to('cpu').numpy()  
         # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
@@ -145,6 +174,8 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         #print("MERGES: ", merges)
         #print("FINAL SEGMENTS: ", finalSegments)
 
+        # TODO change from here (and also change backward) depending on shortening policy [!]
+
         maxSegments = max(segmentNumsInLines)
         paddingMaskOut = np.full((input.shape[0], maxSegments), False)  #torch.BoolTensor(size=(input.shape[0], maxSegments)).fill_(False)
         for i, n in enumerate(segmentNumsInLines):

From b6dc688c1ad30272de6cc87719eefc0c28f830f3 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Mon, 4 Jan 2021 19:04:50 +0100
Subject: [PATCH 20/22] added options with segmentation with only averaging
 without shortening, also with possibility of guessing non-averaged original
 representations; also did some renaming

---
 .../models/wav2vec/wav2vec2_scribblelens.py   | 43 +++++++++++--------
 fairseq/modules/__init__.py                   |  6 +--
 ...tation.py => hierarchical_segmentation.py} | 37 ++++++++++------
 uwr_related/test_cmd_scribble.sh              |  2 +-
 4 files changed, 54 insertions(+), 34 deletions(-)
 rename fairseq/modules/segmentation/{hierarchical_variance_segmentation.py => hierarchical_segmentation.py} (84%)

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index 1eb4b09c15..34c3fe89c8 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -25,7 +25,7 @@
     MultiheadAttention,
     SamePad,
     TransposeLast,
-    HierarchicalVarianceSegmentationLayer,
+    HierarchicalSegmentationLayer,
 )
 from fairseq.modules.transformer_sentence_encoder import init_bert_params
 from fairseq.utils import buffered_arange
@@ -303,11 +303,11 @@ def add_args(parser):
         )
 
         parser.add_argument(
-            "--segm", type=str, help="use segmentation on representations; 'var' (without ') for variance-based hierarchical segm; " \
-                + "also contains options, e.g. for var format is var:<segment_cost>:<shortening_mode>:<batchavg_segment_nr_per_line>, where:\n" \
+            "--segm", type=str, help="use segmentation on representations; 'hier' (without ') for hierarchical segm; " \
+                + "also contains options, e.g. for var format is hier:<segment_cost>:<shortening_mode>:<batchavg_segment_nr_per_line>, where:\n" \
                 + " i) <segment_cost> is se (squared error), var (variance, se div by length), cos (cosine similarity mapped linearly to distance metric and scaled with segment length) \n" \
                 + " ii) <shortening_mode> is one of: shorten (averages in segments and replaces each with length 1), orig_len (replace with mean in segments, but keep length), " \
-                + "orig_len_guess_orig (as in orig_len, but use original not-averaged representations as masked ones to guess correct one from)\n" \
+                + "orig_len&guess_orig (as in orig_len, but use original not-averaged representations as masked ones to guess correct one from)\n" \
                 + " iii) <batchavg_segment_nr_reduction_per_line> is/are float/floats of format <avg_reduction> or <min_avg_reduction>-<max_avg_reduction>\n"
         )  # TODO add option for reconstruction/rounding loss; maybe also think about an option with ~constant length reduction (but at least one piece each segment) so that 
            #      the long segments are not complete random averaged stuff
@@ -457,19 +457,21 @@ def __init__(self, args):
         if 'segm' in args:
             segm_opts = args.segm.split(":")
             # this part needs to set stuff needed by 'segmentation' method
-            if segm_opts[0] == "var":  # TODO change name to hierarchical or so and add cosinus dist option
+            if segm_opts[0] == "hier":
                 self.segm = "var"
                 assert len(segm_opts) == 4
-                self.var_segm_merge_priority = segm_opts[1]
-                self.var_segm_length_policy = segm_opts[2]
+                self.hier_segm_merge_priority = segm_opts[1]
+                shorten_opts = segm_opts[2].split("&")
+                self.hier_segm_shortening_policy = shorten_opts[0]
+                self.hier_segm_guess_orig = len(shorten_opts) > 1 and shorten_opts[1] == "guess_orig"
                 length_reduction_options = list(map(float, segm_opts[2].split("-")))
                 if len(length_reduction_options) == 1:
-                    self.var_segm_strict_reduction = length_reduction_options[0]
-                    self.var_segm_reduction_range = None
+                    self.hier_segm_strict_reduction = length_reduction_options[0]
+                    self.hier_segm_reduction_range = None
                 elif len(length_reduction_options) == 2:
-                    self.var_segm_strict_reduction = None
+                    self.hier_segm_strict_reduction = None
                     assert length_reduction_options[0] <= length_reduction_options[1]
-                    self.var_segm_reduction_range = tuple(length_reduction_options)
+                    self.hier_segm_reduction_range = tuple(length_reduction_options)
                 else:
                     assert False
             else:
@@ -656,6 +658,8 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
                         # - TODO would otherwise need to change unmasked_features = features.clone() to be after projection instead of before
 
         if self.segm:
+            if self.hier_segm_guess_orig:
+                unmasked_features = features.clone()  # if guessing original features, get them before averaging
             features, padding_mask, segment_borders = self.segmentation(features, padding_mask, 5)  
             # [!] minSegmsPerLine needs to be at least a few so that part with masking with at least 2 masks works correctly
 
@@ -672,10 +676,13 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
                         # [!] logging here, before projection as this is what representation segmentation uses 
                         # - TODO would otherwise need to change unmasked_features = features.clone() to be after projection instead of before
 
-        unmasked_features = features.clone()
+        if not self.hier_segm_guess_orig:
+            unmasked_features = features.clone()
+
+        assert(unmasked_features is not None)
 
         # doing it here as needed to clone features after segmentation and clone was before post_extract_proj 
-        # - [!] TODO check if this (cloning before post_extract_proj) is intended, looks very weird to me
+        # - [!] TODO maybe check if this (cloning before post_extract_proj) is intended, but perhaps not a very big difference (only linear projection)
         if self.post_extract_proj is not None:
             features = self.post_extract_proj(features)
 
@@ -775,14 +782,14 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
     def segmentation(self, features, padding_mask, minSegmsPerLine):
         assert self.segm == 'var'  # for now only that supported, to be extended
         non_padded = padding_mask.numel() - padding_mask.sum().item()
-        if self.var_segm_strict_reduction is not None:
-            base_len_sum = int(round(non_padded / self.var_segm_strict_reduction))
-            return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, base_len_sum, None, minSegmsPerLine, self.var_segm_merge_priority)
+        if self.hier_segm_strict_reduction is not None:  # exact target: total segment count = non-padded frames / reduction
+            base_len_sum = int(round(non_padded / self.hier_segm_strict_reduction))
+            return HierarchicalSegmentationLayer.apply(features, padding_mask, base_len_sum, None, minSegmsPerLine, self.hier_segm_merge_priority, self.hier_segm_shortening_policy)
         else:
-            min_reduction, max_reduction = self.var_segm_reduction_range
+            min_reduction, max_reduction = self.hier_segm_reduction_range  # range target: allowed total segment count is [min_segm, max_segm]
             min_segm = base_len_sum = int(round(non_padded / max_reduction))  #max(features.shape[0], int(round(0.85*base_len_sum)))
             max_segm = base_len_sum = int(round(non_padded / min_reduction))  #min(non_padded, int(round(1.15*base_len_sum)))
-            return HierarchicalVarianceSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm), minSegmsPerLine, self.var_segm_merge_priority)
+            return HierarchicalSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm), minSegmsPerLine, self.hier_segm_merge_priority, self.hier_segm_shortening_policy)  # pass the policy here too, else forward() uses its "orig_len" default
 
     def log_segmented_image(self, img, borders, name=None, convert_numbers_from_01=True):
         converted_grayscale_img = img*255. if convert_numbers_from_01 else img
diff --git a/fairseq/modules/__init__.py b/fairseq/modules/__init__.py
index 91fcbd3653..fbcfb4ead3 100644
--- a/fairseq/modules/__init__.py
+++ b/fairseq/modules/__init__.py
@@ -35,7 +35,7 @@
 from .unfold import unfold1d
 from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer
 from .vggblock import VGGBlock
-from .segmentation.hierarchical_variance_segmentation import hierarchicalVarianceSegmentation, HierarchicalVarianceSegmentationLayer
+from .segmentation.hierarchical_segmentation import hierarchicalSegmentation, HierarchicalSegmentationLayer
 
 __all__ = [
     "AdaptiveInput",
@@ -73,7 +73,7 @@
     "TransformerEncoderLayer",
     "TransposeLast",
     "VGGBlock",
-    "hierarchicalVarianceSegmentation",
-    "HierarchicalVarianceSegmentationLayer",
+    "hierarchicalSegmentation",
+    "HierarchicalSegmentationLayer",
     "unfold1d",
 ]
diff --git a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py b/fairseq/modules/segmentation/hierarchical_segmentation.py
similarity index 84%
rename from fairseq/modules/segmentation/hierarchical_variance_segmentation.py
rename to fairseq/modules/segmentation/hierarchical_segmentation.py
index 466788ff5e..631c65e472 100644
--- a/fairseq/modules/segmentation/hierarchical_variance_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_segmentation.py
@@ -29,7 +29,7 @@ def cosDist(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2):  #
     # but the other one would be more expensive to compute
 
 # [!] lines has to be a numpy array, np.sum() crashes if done on tensor
-def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLine=None, mergePriority="mse"):  # k is sum of number of segments for all lines
+def hierarchicalSegmentation(lines, padMask=None, k=None, minSegmsPerLine=None, mergePriority="mse"):  # k is sum of number of segments for all lines
     
     if mergePriority == "se":  # var not divided by size, square error
         costFun = seDiff  #lambda linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2: seDiff(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2)
@@ -105,7 +105,7 @@ def hierarchicalVarianceSegmentation(lines, padMask=None, k=None, minSegmsPerLin
             
     return varChanges, merges, segmentsDict
 
-class HierarchicalVarianceSegmentationLayer(Function):
+class HierarchicalSegmentationLayer(Function):
 
     @staticmethod
     def flatten(x):
@@ -119,11 +119,12 @@ def flatten(x):
     # perhaps that ^ is not needed, and restore_shapes also
 
     @staticmethod
-    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPerLine=None, mergePriority="mse"): 
+    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPerLine=None, mergePriority="se", shorteningPolicy="orig_len"): 
     # k for strict num of segments (SUM FOR ALL LINES), allowKsumRange for range OF SUM OF SEGMENTS IN ALL LINES and choosing 'best' split point
     # min and max number of merges adjusted to what is possible - e.g. because of minSegmsPerLine
 
         assert k is None or allowKsumRange is None  # mutually exclusive options
+        assert shorteningPolicy in ("shorten", "orig_len")  # orig_len&guess_orig is only at the higher level
 
         # TODO if input only 2-dim, add another dimension possibly (W x H -> 1 x W x H, consistent with B x W x H - later assuming that in some places)
 
@@ -137,7 +138,7 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
         # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
 
-        varChanges, merges, segmentsDict = hierarchicalVarianceSegmentation(input, padMask=padMask, k=k, minSegmsPerLine=minSegmsPerLine, mergePriority=mergePriority)  # won't modify input
+        varChanges, merges, segmentsDict = hierarchicalSegmentation(input, padMask=padMask, k=k, minSegmsPerLine=minSegmsPerLine, mergePriority=mergePriority, shorteningPolicy=shorteningPolicy)  # won't modify input
         #print("MERGES0: ", merges)
         if allowKsumRange:  # full merge done above, k=None, so each line now has minSegmsPerLine, but can also just get it from SegmDict - cleaner
             begin, end = allowKsumRange
@@ -186,7 +187,10 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         segmentBorders = np.zeros((input.shape[0], input.shape[1]), dtype=np.int8)
         for line, idxInLine in finalSegments.keys():
             line, begin, end = finalSegments[(line, idxInLine)]
-            segmented[line][idxInLine] = np.mean(input[line][begin:(end+1)], axis=0)  #torch.mean(input[line][begin:(end+1)])
+            if shorteningPolicy == "shorten":
+                segmented[line][idxInLine] = np.mean(input[line][begin:(end+1)], axis=0)  #torch.mean(input[line][begin:(end+1)])
+            else:
+                segmented[line][begin:(end+1)] = np.mean(input[line][begin:(end+1)], axis=0)
             segmentBorders[line][end] = -1  
             segmentBorders[line][begin] = 1  # [!] can be e.g. [...0, 0, 1, 1, ...] with segment of length 1 
             # - marking begins when length 1 as * scaling doesn't need + (scale-1) there if logging only begins
@@ -198,6 +202,10 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         #print("********************", dir(ctx))
         #[not really needed] ctx.save_for_backward(padMask, resPadMask)
         # save_for_backward is only for tensors / variables / stuff
+        if shorteningPolicy == "shorten":
+            ctx.shortened = True
+        else:
+            ctx.shortened = False
         ctx.finalSegments = finalSegments
         ctx.segmentNumsInLines = segmentNumsInLines
         ctx.inputShape = input.shape
@@ -215,13 +223,18 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
         #[not really needed] paddingMask, paddingMaskOut = ctx.saved_tensors
         dx = torch.empty(size=ctx.inputShape, dtype=dxThrough.dtype).fill_(0.).to('cpu')
 
+        wasShortened = ctx.shortened
+
         for line, idxInLine in ctx.finalSegments.keys():
             line, begin, end = ctx.finalSegments[(line, idxInLine)]
-            dx[line][begin:(end+1)] = dxThrough[line][idxInLine] / (end - begin + 1)
+            if wasShortened:
+                dx[line][begin:(end+1)] = dxThrough[line][idxInLine] / (end - begin + 1)
+            else:
+                dx[line][begin:(end+1)] = (dxThrough[line][begin:(end+1)].sum(dim=0)) / (end - begin + 1)
 
         dx = dx.to(dxThroughDevice)
 
-        return dx, None, None, None, None, None
+        return dx, None, None, None, None, None, None
 
 
 if __name__ == '__main__':
@@ -234,12 +247,12 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
 
     tensor = torch.tensor([[[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]], [[1,2],[1,2],[3,4],[3,4],[3,4],[8,9],[8,9]]], dtype=torch.float64).requires_grad_(True)
     print(tensor[0][1])
-    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=4, minSegmsPerLine=None, mergePriority="se"))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
-    print(hierarchicalVarianceSegmentation(tensor.detach().numpy(), padMask=None, k=2, minSegmsPerLine=None, mergePriority="var"))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
+    print(hierarchicalSegmentation(tensor.detach().numpy(), padMask=None, k=4, minSegmsPerLine=None, mergePriority="se"))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
+    print(hierarchicalSegmentation(tensor.detach().numpy(), padMask=None, k=2, minSegmsPerLine=None, mergePriority="var"))  # pre-last merge in each line (merging (0,1) and (2,4)) should be 1.92 if summing 'variance vectors'
 
     print("-------------------------- torch ---------------------------")
     # (tensor, padMask, k, kSumRange)
-    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None, "var")  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None, "var", "shorten")  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
@@ -252,7 +265,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
     print("-------------------------- torch2 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None, "se")  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None, "se", "shorten")  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
@@ -265,7 +278,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
     print("-------------------------- torch3 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask, borders = HierarchicalVarianceSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2, "se")  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2, "se", "shorten")  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
diff --git a/uwr_related/test_cmd_scribble.sh b/uwr_related/test_cmd_scribble.sh
index 70a3ea3bd3..2d418fdae4 100755
--- a/uwr_related/test_cmd_scribble.sh
+++ b/uwr_related/test_cmd_scribble.sh
@@ -58,5 +58,5 @@ python train.py --distributed-world-size 1 --update-freq 2 \
   --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
   --labels `#can be removed for no labels` \
   `#--segm-log-dir ../imgs3 --repr-data-log-dir ../repr3 --random-log-freq 0.0001 --log-ids =:715,%:1000:123` \
-  `#--segm var:se:2.5-3.5  # optional segmentation` \ 
+  `#--segm hier:se:shorten:2.5-3.5  # optional segmentation` \
   --enable-padding # crashes without that, needs to make all lines same-size
\ No newline at end of file

From 0087112960ff768f7744f39da8c25137208c96d3 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Mon, 4 Jan 2021 21:41:14 +0100
Subject: [PATCH 21/22] fixing bugs from last 2 commits

---
 .../models/wav2vec/wav2vec2_scribblelens.py   |  6 ++--
 .../segmentation/hierarchical_segmentation.py | 31 ++++++++++++++-----
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index 34c3fe89c8..1ba7c4b333 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -307,7 +307,7 @@ def add_args(parser):
                 + "also contains options, e.g. for var format is hier:<segment_cost>:<shortening_mode>:<batchavg_segment_nr_per_line>, where:\n" \
                 + " i) <segment_cost> is se (squared error), var (variance, se div by length), cos (cosine similarity mapped linearly to distance metric and scaled with segment length) \n" \
                 + " ii) <shortening_mode> is one of: shorten (averages in segments and replaces each with length 1), orig_len (replace with mean in segments, but keep length), " \
-                + "orig_len&guess_orig (as in orig_len, but use original not-averaged representations as masked ones to guess correct one from)\n" \
+                + "orig_len+guess_orig (as in orig_len, but use original not-averaged representations as masked ones to guess correct one from)\n" \
                 + " iii) <batchavg_segment_nr_reduction_per_line> is/are float/floats of format <avg_reduction> or <min_avg_reduction>-<max_avg_reduction>\n"
         )  # TODO add option for reconstruction/rounding loss; maybe also think about an option with ~constant length reduction (but at least one piece each segment) so that 
            #      the long segments are not complete random averaged stuff
@@ -461,10 +461,10 @@ def __init__(self, args):
                 self.segm = "var"
                 assert len(segm_opts) == 4
                 self.hier_segm_merge_priority = segm_opts[1]
-                shorten_opts = segm_opts[2].split("&")
+                shorten_opts = segm_opts[2].split("+")
                 self.hier_segm_shortening_policy = shorten_opts[0]
                 self.hier_segm_guess_orig = len(shorten_opts) > 1 and shorten_opts[1] == "guess_orig"
-                length_reduction_options = list(map(float, segm_opts[2].split("-")))
+                length_reduction_options = list(map(float, segm_opts[3].split("-")))
                 if len(length_reduction_options) == 1:
                     self.hier_segm_strict_reduction = length_reduction_options[0]
                     self.hier_segm_reduction_range = None
diff --git a/fairseq/modules/segmentation/hierarchical_segmentation.py b/fairseq/modules/segmentation/hierarchical_segmentation.py
index 631c65e472..c73bb476c8 100644
--- a/fairseq/modules/segmentation/hierarchical_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_segmentation.py
@@ -124,7 +124,7 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
     # min and max number of merges adjusted to what is possible - e.g. because of minSegmsPerLine
 
         assert k is None or allowKsumRange is None  # mutually exclusive options
-        assert shorteningPolicy in ("shorten", "orig_len")  # orig_len&guess_orig is only at the higher level
+        assert shorteningPolicy in ("shorten", "orig_len")  # orig_len+guess_orig is only at the higher level
 
         # TODO if input only 2-dim, add another dimension possibly (W x H -> 1 x W x H, consistent with B x W x H - later assuming that in some places)
 
@@ -138,7 +138,7 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
         # https://discuss.pytorch.org/t/what-is-the-cpu-in-pytorch/15007/3
 
-        varChanges, merges, segmentsDict = hierarchicalSegmentation(input, padMask=padMask, k=k, minSegmsPerLine=minSegmsPerLine, mergePriority=mergePriority, shorteningPolicy=shorteningPolicy)  # won't modify input
+        varChanges, merges, segmentsDict = hierarchicalSegmentation(input, padMask=padMask, k=k, minSegmsPerLine=minSegmsPerLine, mergePriority=mergePriority)  # won't modify input
         #print("MERGES0: ", merges)
         if allowKsumRange:  # full merge done above, k=None, so each line now has minSegmsPerLine, but can also just get it from SegmDict - cleaner
             begin, end = allowKsumRange
@@ -178,11 +178,16 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         # TODO change from here (and also change backward) depending on shortening policy [!]
 
         maxSegments = max(segmentNumsInLines)
-        paddingMaskOut = np.full((input.shape[0], maxSegments), False)  #torch.BoolTensor(size=(input.shape[0], maxSegments)).fill_(False)
-        for i, n in enumerate(segmentNumsInLines):
-            paddingMaskOut[i][n:] = True
         
-        segmented = np.full((input.shape[0], maxSegments, input.shape[2]), 0.)  #torch.tensor(size=(input.shape[0], maxSegments, input.shape[2])).fill_(0.)
+        if shorteningPolicy == "shorten":
+            segmented = np.full((input.shape[0], maxSegments, input.shape[2]), 0.)  #torch.tensor(size=(input.shape[0], maxSegments, input.shape[2])).fill_(0.)
+            paddingMaskOut = np.full((input.shape[0], maxSegments), False)  #torch.BoolTensor(size=(input.shape[0], maxSegments)).fill_(False)
+            for i, n in enumerate(segmentNumsInLines):
+                paddingMaskOut[i][n:] = True
+            resPadMask = torch.BoolTensor(paddingMaskOut).to(padMaskInputDevice)
+        else:
+            segmented = np.full(input.shape, 0.)
+            resPadMask = padMask
         # can perhaps return a tensor with 1 at the beginning of the segments, -1 at the end, 0s elsewhere
         segmentBorders = np.zeros((input.shape[0], input.shape[1]), dtype=np.int8)
         for line, idxInLine in finalSegments.keys():
@@ -196,7 +201,7 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
             # - marking begins when length 1 as * scaling doesn't need + (scale-1) there if logging only begins
 
         resOutput = torch.tensor(segmented, dtype=inputGPU.dtype).to(inputDevice)   #if wasInputOnGPU else torch.tensor(segmented)  #.requires_grad_(True)
-        resPadMask = torch.BoolTensor(paddingMaskOut).to(padMaskInputDevice)   #if wasPadMaskOnGPU else torch.BoolTensor(paddingMaskOut)
+        # resPadMask is created above, because for some reason torch.BoolTensor(paddingMaskOut).to(padMaskInputDevice) threw an error when paddingMaskOut was a tensor on the correct device
         segmentBorders = torch.IntTensor(segmentBorders).to(inputDevice)
 
         #print("********************", dir(ctx))
@@ -284,5 +289,17 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
     print(borders)
     # [!] here will return 4 segments instead of specified 3, because of specified minSegmsPerLine
 
+    resOutput.sum().backward()  # .backward() needs loss to be a number (tensor of size (1,))
+    print(tensor.grad)
+
+    print("-------------------------- torch4 ---------------------------")
+    # (tensor, padMask, k, kSumRange)
+    tensor.grad.data.zero_()
+    resOutput, resPadMask, borders = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2, "se", "orig_len")  #(2, 5))  # can;t have keyword args for torch Functions...
+    print(resOutput)
+    print(resPadMask)
+    print(borders)
+    # [!] here will return 4 segments instead of specified 3, because of specified minSegmsPerLine
+
     resOutput.sum().backward()  # .backward() needs loss to be a number (tensor of size (1,))
     print(tensor.grad)
\ No newline at end of file

From 91bf9a1a074a2fde08ec7a7e85805c82ae6cb1a1 Mon Sep 17 00:00:00 2001
From: Piotr Pusz <petropusz@gmail.com>
Date: Fri, 8 Jan 2021 02:06:13 +0100
Subject: [PATCH 22/22] added rounding loss option to segmentation

---
 .../models/wav2vec/wav2vec2_scribblelens.py   | 30 +++++++-----
 .../segmentation/hierarchical_segmentation.py | 48 ++++++++++++++-----
 uwr_related/test_cmd_scribble.sh              |  6 +--
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/fairseq/models/wav2vec/wav2vec2_scribblelens.py b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
index 1ba7c4b333..885610cc2c 100644
--- a/fairseq/models/wav2vec/wav2vec2_scribblelens.py
+++ b/fairseq/models/wav2vec/wav2vec2_scribblelens.py
@@ -304,12 +304,14 @@ def add_args(parser):
 
         parser.add_argument(
             "--segm", type=str, help="use segmentation on representations; 'hier' (without ') for hierarchical segm; " \
-                + "also contains options, e.g. for var format is hier:<segment_cost>:<shortening_mode>:<batchavg_segment_nr_per_line>, where:\n" \
+                + "also contains options, e.g. for var format is hier:<segment_cost>:<rounding_loss>:<shortening_mode>:<batchavg_segment_nr_per_line>, where:\n" \
                 + " i) <segment_cost> is se (squared error), var (variance, se div by length), cos (cosine similarity mapped linearly to distance metric and scaled with segment length) \n" \
-                + " ii) <shortening_mode> is one of: shorten (averages in segments and replaces each with length 1), orig_len (replace with mean in segments, but keep length), " \
+                + " ii) <rounding_loss> is additional rounding loss to use (se, var, lin, cos, or none) - measuring distance of average given for segment from original representations; " \
+                + "need to add weight for this loss in loss-weights param if not none\n" \
+                + " iii) <shortening_mode> is one of: shorten (averages in segments and replaces each with length 1), orig_len (replace with mean in segments, but keep length), " \
                 + "orig_len+guess_orig (as in orig_len, but use original not-averaged representations as masked ones to guess correct one from)\n" \
-                + " iii) <batchavg_segment_nr_reduction_per_line> is/are float/floats of format <avg_reduction> or <min_avg_reduction>-<max_avg_reduction>\n"
-        )  # TODO add option for reconstruction/rounding loss; maybe also think about an option with ~constant length reduction (but at least one piece each segment) so that 
+                + " iv) <batchavg_segment_nr_reduction_per_line> is/are float/floats of format <avg_reduction> or <min_avg_reduction>-<max_avg_reduction>\n"
+        )  # TODO maybe also think about an option with ~constant length reduction (but at least one piece each segment) so that 
            #      the long segments are not complete random averaged stuff
 
         parser.add_argument(
@@ -459,12 +461,13 @@ def __init__(self, args):
             # this part needs to set stuff needed by 'segmentation' method
             if segm_opts[0] == "hier":
                 self.segm = "var"
-                assert len(segm_opts) == 4
+                assert len(segm_opts) == 5
                 self.hier_segm_merge_priority = segm_opts[1]
-                shorten_opts = segm_opts[2].split("+")
+                self.hier_segm_rounding_loss = segm_opts[2] if segm_opts[2] != "none" else None
+                shorten_opts = segm_opts[3].split("+")
                 self.hier_segm_shortening_policy = shorten_opts[0]
                 self.hier_segm_guess_orig = len(shorten_opts) > 1 and shorten_opts[1] == "guess_orig"
-                length_reduction_options = list(map(float, segm_opts[3].split("-")))
+                length_reduction_options = list(map(float, segm_opts[4].split("-")))
                 if len(length_reduction_options) == 1:
                     self.hier_segm_strict_reduction = length_reduction_options[0]
                     self.hier_segm_reduction_range = None
@@ -660,7 +663,7 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
         if self.segm:
             if self.hier_segm_guess_orig:
                 unmasked_features = features.clone()  # if guessing original features, get them before averaging
-            features, padding_mask, segment_borders = self.segmentation(features, padding_mask, 5)  
+            features, padding_mask, segment_borders, rounding_loss = self.segmentation(features, padding_mask, 5)
             # [!] minSegmsPerLine needs to be at least a few so that part with masking with at least 2 masks works correctly
 
         if self.need_logging:
@@ -676,7 +679,7 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
                         # [!] logging here, before projection as this is what representation segmentation uses 
                         # - TODO would otherwise need to change unmasked_features = features.clone() to be after projection instead of before
 
-        if not self.hier_segm_guess_orig:
+        if not self.segm or not self.hier_segm_guess_orig:
             unmasked_features = features.clone()
 
         assert(unmasked_features is not None)
@@ -776,6 +779,8 @@ def forward(self, source, padding_mask=None, mask=True, features_only=False, id=
             result["code_perplexity"] = code_ppl
             result["num_vars"] = num_vars
             result["temp"] = curr_temp
+        if self.segm and self.hier_segm_rounding_loss is not None:
+            result["rounding_loss"] = rounding_loss
 
         return result
 
@@ -784,12 +789,12 @@ def segmentation(self, features, padding_mask, minSegmsPerLine):
         non_padded = padding_mask.numel() - padding_mask.sum().item()
         if self.hier_segm_strict_reduction is not None:
             base_len_sum = int(round(non_padded / self.hier_segm_strict_reduction))
-            return HierarchicalSegmentationLayer.apply(features, padding_mask, base_len_sum, None, minSegmsPerLine, self.hier_segm_merge_priority, self.hier_segm_shortening_policy)
+            return HierarchicalSegmentationLayer.apply(features, padding_mask, base_len_sum, None, minSegmsPerLine, self.hier_segm_merge_priority, self.hier_segm_shortening_policy, self.hier_segm_rounding_loss)
         else:
             min_reduction, max_reduction = self.hier_segm_reduction_range
             min_segm = base_len_sum = int(round(non_padded / max_reduction))  #max(features.shape[0], int(round(0.85*base_len_sum)))
             max_segm = base_len_sum = int(round(non_padded / min_reduction))  #min(non_padded, int(round(1.15*base_len_sum)))
-            return HierarchicalSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm), minSegmsPerLine, self.hier_segm_merge_priority)
+            return HierarchicalSegmentationLayer.apply(features, padding_mask, None, (min_segm, max_segm), minSegmsPerLine, self.hier_segm_merge_priority, self.hier_segm_shortening_policy, self.hier_segm_rounding_loss)
 
     def log_segmented_image(self, img, borders, name=None, convert_numbers_from_01=True):
         converted_grayscale_img = img*255. if convert_numbers_from_01 else img
@@ -871,6 +876,9 @@ def get_extra_losses(self, net_output):
         if "features_pen" in net_output:
             pen.append(net_output["features_pen"])
 
+        if self.segm and self.hier_segm_rounding_loss is not None:
+            pen.append(net_output["rounding_loss"])
+
         return pen
 
     def remove_pretraining_modules(self):
diff --git a/fairseq/modules/segmentation/hierarchical_segmentation.py b/fairseq/modules/segmentation/hierarchical_segmentation.py
index c73bb476c8..57ed6bbd8e 100644
--- a/fairseq/modules/segmentation/hierarchical_segmentation.py
+++ b/fairseq/modules/segmentation/hierarchical_segmentation.py
@@ -28,6 +28,20 @@ def cosDist(linearSum1, squaresSum1, size1, linearSum2, squaresSum2, size2):  #
     # but that's perhaps not exactly this sum as cosine_similarity ( (sum_i a_i) , x ) is not the same as (sum_i cosine_similarity ( a_i , x )) )
     # but the other one would be more expensive to compute
 
+def linRoundingLoss(mean, originals):  # L1 rounding loss: sum of |originals - mean| over all elements
+    return torch.abs(originals - mean).sum()
+
+def varRoundingLoss(mean, originals):  # squared error averaged over dim 0 of originals, then summed over the remaining elements
+    return torch.mean(torch.square(originals - mean), dim=0).sum()
+
+def seRoundingLoss(mean, originals):  # squared-error rounding loss: sum of (originals - mean)^2 over all elements
+    return torch.square(originals - mean).sum()
+
+def cosRoundingLoss(mean, originals):  # sum over rows of originals of (1 - cosine similarity between the row and mean)
+    # mean is (D,) and originals is (n, D) as passed from forward(); the previous matmul(mean, originals) form only ran when n == D
+    unscaledSim = torch.nn.functional.cosine_similarity(originals, mean.expand_as(originals), dim=-1)
+    return (1. - unscaledSim).sum()
+
 # [!] lines has to be a numpy array, np.sum() crashes if done on tensor
 def hierarchicalSegmentation(lines, padMask=None, k=None, minSegmsPerLine=None, mergePriority="mse"):  # k is sum of number of segments for all lines
     
@@ -119,20 +133,29 @@ def flatten(x):
     # perhaps that ^ is not needed, and restore_shapes also
 
     @staticmethod
-    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPerLine=None, mergePriority="se", shorteningPolicy="orig_len"): 
+    def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPerLine=None, mergePriority="se", shorteningPolicy="orig_len", roundingLossType=None): 
     # k for strict num of segments (SUM FOR ALL LINES), allowKsumRange for range OF SUM OF SEGMENTS IN ALL LINES and choosing 'best' split point
     # min and max number of merges adjusted to what is possible - e.g. because of minSegmsPerLine
 
         assert k is None or allowKsumRange is None  # mutually exclusive options
         assert shorteningPolicy in ("shorten", "orig_len")  # orig_len+guess_orig is only at the higher level
+        assert roundingLossType in ("se", "var", "lin", "cos", None)
+        # Pick the rounding-loss implementation for the requested type.
+        # None selects a no-op so the accumulated rounding loss stays 0;
+        # this case used to reach `assert False` and crash every call made without a rounding loss.
+        roundingLossFun = {
+            "se": seRoundingLoss,
+            "var": varRoundingLoss,
+            "lin": linRoundingLoss,
+            "cos": cosRoundingLoss,
+            None: lambda mean, originals: 0.,
+        }[roundingLossType]
 
         # TODO if input only 2-dim, add another dimension possibly (W x H -> 1 x W x H, consistent with B x W x H - later assuming that in some places)
 
         inputDevice = inputGPU.device
         padMaskInputDevice = padMask.device if padMask is not None else False
 
-        # TODO TODO add shortening policy stuff !
-
         # tensor to CPU  (don't really need copy, will just need to put tensors in segmentsDict)
         input = inputGPU.detach().to('cpu').numpy()  
         # https://discuss.pytorch.org/t/cant-convert-cuda-tensor-to-numpy-use-tensor-cpu-to-copy-the-tensor-to-host-memory-first/38301 ,
@@ -175,8 +198,6 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
         #print("MERGES: ", merges)
         #print("FINAL SEGMENTS: ", finalSegments)
 
-        # TODO change from here (and also change backward) depending on shortening policy [!]
-
         maxSegments = max(segmentNumsInLines)
         
         if shorteningPolicy == "shorten":
@@ -190,12 +211,14 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
             resPadMask = padMask
         # can perhaps return a tensor with 1 at the beginning of the segments, -1 at the end, 0s elsewhere
         segmentBorders = np.zeros((input.shape[0], input.shape[1]), dtype=np.int8)
+        roundingLoss = torch.tensor(0, dtype=torch.float32).requires_grad_(True).to(inputDevice)  # TODO dtype (?)
         for line, idxInLine in finalSegments.keys():
             line, begin, end = finalSegments[(line, idxInLine)]
             if shorteningPolicy == "shorten":
                 segmented[line][idxInLine] = np.mean(input[line][begin:(end+1)], axis=0)  #torch.mean(input[line][begin:(end+1)])
             else:
                 segmented[line][begin:(end+1)] = np.mean(input[line][begin:(end+1)], axis=0)
+            roundingLoss += roundingLossFun(torch.mean(inputGPU[line][begin:(end+1)], dim=0), inputGPU[line][begin:(end+1)])
             segmentBorders[line][end] = -1  
             segmentBorders[line][begin] = 1  # [!] can be e.g. [...0, 0, 1, 1, ...] with segment of length 1 
             # - marking begins when length 1 as * scaling doesn't need + (scale-1) there if logging only begins
@@ -218,10 +241,11 @@ def forward(ctx, inputGPU, padMask=None, k=None, allowKsumRange=None, minSegmsPe
 
         #print("FINAL SEGMENTS: ", finalSegments, segmentNumsInLines)
 
-        return resOutput, resPadMask, segmentBorders  #, finalSegments, segmentNumsInLines can only return torch variables... TODO maybe check how to fetch this info, but not sure if needed
+        # with rounding loss None, will just return 0
+        return resOutput, resPadMask, segmentBorders, roundingLoss  #, finalSegments, segmentNumsInLines can only return torch variables... TODO maybe check how to fetch this info, but not sure if needed
 
     @staticmethod
-    def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSegments=None, segmentNumsInLines=None):
+    def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None, roundingLoss=None):  #, finalSegments=None, segmentNumsInLines=None):
 
         dxThroughDevice = dxThrough.device
 
@@ -239,7 +263,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
 
         dx = dx.to(dxThroughDevice)
 
-        return dx, None, None, None, None, None, None
+        return dx, None, None, None, None, None, None, None
 
 
 if __name__ == '__main__':
@@ -257,7 +281,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
 
     print("-------------------------- torch ---------------------------")
     # (tensor, padMask, k, kSumRange)
-    resOutput, resPadMask, borders = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None, "var", "shorten")  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders, roundingLoss = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), None, (2,5), None, "var", "shorten", None)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
@@ -270,7 +294,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
     print("-------------------------- torch2 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask, borders = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None, "se", "shorten")  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders, roundingLoss = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, None, "se", "shorten", None)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
@@ -283,7 +307,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
     print("-------------------------- torch3 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask, borders = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2, "se", "shorten")  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders, roundingLoss = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2, "se", "shorten", None)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
@@ -295,7 +319,7 @@ def backward(ctx, dxThrough, outPadMask=None, segmentBorders=None):  #, finalSeg
     print("-------------------------- torch4 ---------------------------")
     # (tensor, padMask, k, kSumRange)
     tensor.grad.data.zero_()
-    resOutput, resPadMask, borders = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2, "se", "orig_len")  #(2, 5))  # can;t have keyword args for torch Functions...
+    resOutput, resPadMask, borders, roundingLoss = HierarchicalSegmentationLayer.apply(tensor, torch.tensor([[True, False, False, False, False, False, False], [False, False, False, False, False, False, True]]), 3, None, 2, "se", "orig_len", None)  #(2, 5))  # can;t have keyword args for torch Functions...
     print(resOutput)
     print(resPadMask)
     print(borders)
diff --git a/uwr_related/test_cmd_scribble.sh b/uwr_related/test_cmd_scribble.sh
index 2d418fdae4..316218e282 100755
--- a/uwr_related/test_cmd_scribble.sh
+++ b/uwr_related/test_cmd_scribble.sh
@@ -48,15 +48,15 @@ python train.py --distributed-world-size 1 --update-freq 2 \
   --final-dim 256 \
   --latent-vars 320 --latent-groups 2 --latent-temp '(2,0.5,0.999995)' --infonce \
   --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-06 --lr-scheduler polynomial_decay \
-  --total-num-update 400000 --lr 0.0005 --warmup-updates 32000 \
+  --total-num-update 40000 --lr 0.0005 --warmup-updates 3000 `#32000 warmup updates is too much for scribblelens - more than twice as far as the point where training collapses; same goes for 400000 total updates` \
   --mask-length 10 --mask-prob 0.65 --mask-selection static --mask-other 0 \
   --encoder-layerdrop 0.05 --dropout-input 0.1 --dropout-features 0.1 --feature-grad-mult 0.1 \
-  --loss-weights '[0.1, 10]' --conv-pos 128 --conv-pos-groups 16 \
+  --loss-weights '[0.1, 10]' `#'[0.1, 10, 3]'` --conv-pos 128 --conv-pos-groups 16 \
   --num-negatives 100 --cross-sample-negatives 0 \
   `#--max-sample-size 250000 --min-sample-size 32000` \
   --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --max-tokens 10000 --max-update 400000 \
   --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
   --labels `#can be removed for no labels` \
   `#--segm-log-dir ../imgs3 --repr-data-log-dir ../repr3 --random-log-freq 0.0001 --log-ids =:715,%:1000:123` \
-  `#--segm hier:se:shorten:2.5-3.5  # optional segmentation` \ 
+  `#--segm hier:se:none:shorten:2.5-3.5  # optional segmentation; no whitespace may follow the trailing backslash, or --enable-padding below is cut off from the command` \
   --enable-padding # crashes without that, needs to make all lines same-size
\ No newline at end of file