Skip to content

Commit 674fdf3

Browse files
committed
Fix line mapping being incorrect.
Until now, a wrong line mapping would be computed when adding text e.g. to the end of a line. This is because in the `git diff --word-diff=porcelain` that was used so far, such changes appear as a "+" line (which makes sense given that it's a word- diff), which the code so far interpreted as the addition of a new line when in fact only words inside the line were changed. So far, I believe that a correct line mapping cannot be computed using `--word-diff`, because it inherently ignores whitespace: If you delete whitespace at the end of the line, and also some following empty lines, it doesn't even appear in `git diff --word-diff=porcelain`. This commit uses `diff` instead, which has a feature to tell us the added, deleted and unchanged lines, from which we can directly construct the line mapping.
1 parent 5c93039 commit 674fdf3

File tree

1 file changed

+63
-106
lines changed

1 file changed

+63
-106
lines changed

gitbrowse/git.py

Lines changed: 63 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import subprocess
23

34

45
class GitCommit(object):
@@ -158,119 +159,75 @@ def _build_line_mappings(self, start, finish):
158159
forward = {}
159160
backward = {}
160161

161-
# Get information about blank lines: The git diff porcelain format
162-
# (which we use for everything else) doesn't distinguish between
163-
# additions and removals, so this is a very dirty hack to get around
164-
# the problem.
165-
p = os.popen('git diff %s %s -- %s | grep -E "^[+-]$"' % (
166-
start,
167-
finish,
168-
self.path,
169-
))
170-
blank_lines = [l.strip() for l in p.readlines()]
171-
172-
p = os.popen('git diff --word-diff=porcelain %s %s -- %s' % (
173-
start,
174-
finish,
175-
self.path,
176-
))
177-
178-
# The diff output is in sections: A header line (indicating the
179-
# range of lines this section covers) and then a number of
180-
# content lines.
181-
182-
sections = []
183-
184-
# Skip initial headers: They don't interest us.
185-
line = p.readline()
186-
while line != '' and not line.startswith('@@'):
187-
line = p.readline()
188-
189-
while line:
190-
header_line = line
191-
content_lines = []
192-
193-
line = p.readline()
194-
while line and not line.startswith('@@'):
195-
content_lines.append(line)
196-
line = p.readline()
197-
198-
sections.append((header_line, content_lines, ))
199-
200-
201-
start_ln = finish_ln = 0
202-
for header_line, content_lines in sections:
203-
# The headers line has the format '@@ +a,b -c,d @@[ e]' where
204-
# a is the first line number shown from start and b is the
205-
# number of lines shown from start, and c is the first line
206-
# number show from finish and d is the number of lines show
207-
# from from finish, and e is Git's guess at the name of the
208-
# context (and is not always present)
209-
210-
headers = header_line.strip('@ \n').split(' ')
211-
headers = map(lambda x: x.strip('+-').split(','), headers)
212-
213-
start_range = map(int, headers[0])
214-
finish_range = map(int, headers[1])
215-
216-
while start_ln < start_range[0] - 1 and \
217-
finish_ln < finish_range[0] - 1:
218-
forward[start_ln] = finish_ln
219-
backward[finish_ln] = start_ln
220-
start_ln += 1
221-
finish_ln += 1
222-
223-
# Now we're into the diff itself. Individual lines of input
224-
# are separated by a line containing only a '~', this helps
225-
# to distinguish between an addition, a removal, and a change.
226-
227-
line_iter = iter(content_lines)
228-
try:
229-
while True:
230-
group_size = -1
231-
line_delta = 0
232-
line = ' '
233-
while line != '~':
234-
if line.startswith('+'):
235-
line_delta += 1
236-
elif line.startswith('-'):
237-
line_delta -= 1
238-
239-
group_size += 1
240-
line = line_iter.next().rstrip()
241-
242-
if group_size == 0:
243-
# Two '~' lines next to each other means a blank
244-
# line has been either added or removed. Git
245-
# doesn't tell us which. This is all crazy.
246-
if blank_lines.pop(0) == '+':
247-
line_delta += 1
248-
else:
249-
line_delta -= 1
250-
251-
if line_delta == 1:
252-
backward[finish_ln] = None
253-
finish_ln += 1
254-
elif line_delta == -1:
255-
forward[start_ln] = None
256-
start_ln += 1
257-
else:
258-
forward[start_ln] = finish_ln
259-
backward[finish_ln] = start_ln
260-
start_ln += 1
261-
finish_ln += 1
262-
except StopIteration:
263-
pass
264-
265-
# Make sure the mappings stretch the the beginning and end of
266-
# the files.
162+
# We use `diff` to track blocks of added, deleted and unchanged lines
163+
# in order to build the line mapping.
164+
# Its `--old/new/unchanged-group-format` flags make this very easy;
165+
# it generates output like this:
166+
# u 8
167+
# o 3
168+
# n 4
169+
# u 1
170+
# for a diff in which the first 8 lines are unchanged, then 3 deleted,
171+
# then 4 added and then 1 unchanged.
172+
# Below, we parse this output.
173+
#
174+
# In order to get the file contents of the two commits into `diff`,
175+
# we use the equivalent of bash's /dev/fd/N based process subsititution,
176+
# which would look like this:
177+
# diff <(git show commit1:file) <(git show commit2:file)
178+
# (this works on all platforms where bash process substitution works).
179+
180+
p_start = os.popen('git show %s:%s' % (start, self.path))
181+
p_finish = os.popen('git show %s:%s' % (finish, self.path))
182+
183+
p_diff = subprocess.Popen([
184+
'diff',
185+
'/dev/fd/' + str(p_start.fileno()),
186+
'/dev/fd/' + str(p_finish.fileno()),
187+
'--old-group-format=o %dn\n', # lower case n for old file
188+
'--new-group-format=n %dN\n', # upper case N for new file
189+
'--unchanged-group-format=u %dN\n', # for unchanged it doesn't matter if n or N
190+
], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
191+
192+
(out, err) = p_diff.communicate()
193+
assert err == ''
194+
195+
# Unfortunately, splitting the empty string in Python still gives us a singleton
196+
# empty line (`''.split('\n') == ['']`), so we handle that case here.
197+
diff_lines = [] if out == '' else out.strip().split('\n')
198+
199+
start_ln = 0
200+
finish_ln = 0
201+
202+
for line in diff_lines:
203+
assert len(line) >= 3
204+
# Parse the output created with `diff` above.
205+
typ, num_lines_str = line.split(' ')
206+
num_lines = int(num_lines_str)
207+
208+
if typ == 'u': # unchanged lines, advance both sides
209+
for i in range(num_lines):
210+
forward[start_ln] = finish_ln
211+
backward[finish_ln] = start_ln
212+
start_ln += 1
213+
finish_ln += 1
214+
elif typ == 'o': # old/deleted lines, advance left side as they only exist there
215+
for i in range(num_lines):
216+
forward[start_ln] = None
217+
start_ln += 1
218+
elif typ == 'n': # new/added lines, advance right side as they only exist there
219+
for i in range(num_lines):
220+
backward[finish_ln] = None
221+
finish_ln += 1
267222

268223
p = os.popen('git show %s:%s' % (start, self.path))
269224
start_len = len(p.readlines())
270225

271226
p = os.popen('git show %s:%s' % (finish, self.path))
272227
finish_len = len(p.readlines())
273228

229+
# Make sure the mappings stretch the the beginning and end of
230+
# the files.
274231
while start_ln <= start_len and finish_ln <= finish_len:
275232
forward[start_ln] = finish_ln
276233
backward[finish_ln] = start_ln

0 commit comments

Comments
 (0)