Skip to content

Commit 3ee291c

Browse files
committed
Store raw path bytes in Diff instances
Previously, the following fields on Diff instances were assumed to be passed in as unicode strings: - `a_path` - `b_path` - `rename_from` - `rename_to` However, since Git natively records paths as bytes, these may potentially not have a valid unicode representation. This patch changes the Diff instance to instead take the following equivalent fields that should be raw bytes instead: - `a_rawpath` - `b_rawpath` - `raw_rename_from` - `raw_rename_to` NOTE ON BACKWARD COMPATIBILITY: The original `a_path`, `b_path`, etc. fields are still available as properties (rather than slots). These properties now dynamically decode the raw bytes into a unicode string (performing the potentially destructive operation of replacing invalid unicode chars by "�"'s). This means that all code using Diffs should remain backward compatible. The only exception is when people would manually construct Diff instances by calling the constructor directly, in which case they should now pass in bytes rather than unicode strings. See also the discussion on #467
1 parent 105a8c0 commit 3ee291c

File tree

4 files changed

+51
-18
lines changed

4 files changed

+51
-18
lines changed

doc/source/changes.rst

+3
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ Changelog
55
2.0.6 - Fixes and Features
66
==========================
77

8+
* API: Diffs now have `a_rawpath`, `b_rawpath`, `raw_rename_from`,
9+
`raw_rename_to` properties, which are the raw-bytes equivalents of their
10+
unicode path counterparts.
811
* Fix: TypeError about passing keyword argument to string decode() on
912
Python 2.6.
1013
* Feature: `setUrl API on Remotes <https://github.com/gitpython-developers/GitPython/pull/446#issuecomment-224670539>`_

git/compat.py

+2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def mviter(d):
3535
return d.values()
3636
range = xrange
3737
unicode = str
38+
binary_type = bytes
3839
else:
3940
FileType = file
4041
# usually, this is just ascii, which might not enough for our encoding needs
@@ -44,6 +45,7 @@ def mviter(d):
4445
byte_ord = ord
4546
bchr = chr
4647
unicode = unicode
48+
binary_type = str
4749
range = xrange
4850
def mviter(d):
4951
return d.itervalues()

git/diff.py

+41-17
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from gitdb.util import hex_to_bin
99

10+
from .compat import binary_type
1011
from .objects.blob import Blob
1112
from .objects.util import mode_str_to_int
1213

@@ -245,18 +246,20 @@ class Diff(object):
245246
NULL_HEX_SHA = "0" * 40
246247
NULL_BIN_SHA = b"\0" * 20
247248

248-
__slots__ = ("a_blob", "b_blob", "a_mode", "b_mode", "a_path", "b_path",
249-
"new_file", "deleted_file", "rename_from", "rename_to", "diff")
249+
__slots__ = ("a_blob", "b_blob", "a_mode", "b_mode", "a_rawpath", "b_rawpath",
250+
"new_file", "deleted_file", "raw_rename_from", "raw_rename_to", "diff")
250251

251-
def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode,
252-
b_mode, new_file, deleted_file, rename_from,
253-
rename_to, diff):
252+
def __init__(self, repo, a_rawpath, b_rawpath, a_blob_id, b_blob_id, a_mode,
253+
b_mode, new_file, deleted_file, raw_rename_from,
254+
raw_rename_to, diff):
254255

255256
self.a_mode = a_mode
256257
self.b_mode = b_mode
257258

258-
self.a_path = a_path
259-
self.b_path = b_path
259+
assert a_rawpath is None or isinstance(a_rawpath, binary_type)
260+
assert b_rawpath is None or isinstance(b_rawpath, binary_type)
261+
self.a_rawpath = a_rawpath
262+
self.b_rawpath = b_rawpath
260263

261264
if self.a_mode:
262265
self.a_mode = mode_str_to_int(self.a_mode)
@@ -266,19 +269,21 @@ def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode,
266269
if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA:
267270
self.a_blob = None
268271
else:
269-
self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=a_path)
272+
self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=self.a_path)
270273

271274
if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA:
272275
self.b_blob = None
273276
else:
274-
self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=b_path)
277+
self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=self.b_path)
275278

276279
self.new_file = new_file
277280
self.deleted_file = deleted_file
278281

279282
# be clear and use None instead of empty strings
280-
self.rename_from = rename_from or None
281-
self.rename_to = rename_to or None
283+
assert raw_rename_from is None or isinstance(raw_rename_from, binary_type)
284+
assert raw_rename_to is None or isinstance(raw_rename_to, binary_type)
285+
self.raw_rename_from = raw_rename_from or None
286+
self.raw_rename_to = raw_rename_to or None
282287

283288
self.diff = diff
284289

@@ -344,6 +349,22 @@ def __str__(self):
344349
# end
345350
return res
346351

352+
@property
353+
def a_path(self):
354+
return self.a_rawpath.decode(defenc, 'replace') if self.a_rawpath else None
355+
356+
@property
357+
def b_path(self):
358+
return self.b_rawpath.decode(defenc, 'replace') if self.b_rawpath else None
359+
360+
@property
361+
def rename_from(self):
362+
return self.raw_rename_from.decode(defenc, 'replace') if self.raw_rename_from else None
363+
364+
@property
365+
def rename_to(self):
366+
return self.raw_rename_to.decode(defenc, 'replace') if self.raw_rename_to else None
367+
347368
@property
348369
def renamed(self):
349370
""":returns: True if the blob of our diff has been renamed
@@ -388,6 +409,7 @@ def _index_from_patch_format(cls, repo, stream):
388409
new_file_mode, deleted_file_mode, \
389410
a_blob_id, b_blob_id, b_mode, \
390411
a_path, b_path = header.groups()
412+
391413
new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode)
392414

393415
a_path = cls._pick_best_path(a_path, rename_from, a_path_fallback)
@@ -404,15 +426,15 @@ def _index_from_patch_format(cls, repo, stream):
404426
a_mode = old_mode or deleted_file_mode or (a_path and (b_mode or new_mode or new_file_mode))
405427
b_mode = b_mode or new_mode or new_file_mode or (b_path and a_mode)
406428
index.append(Diff(repo,
407-
a_path and a_path.decode(defenc, 'replace'),
408-
b_path and b_path.decode(defenc, 'replace'),
429+
a_path,
430+
b_path,
409431
a_blob_id and a_blob_id.decode(defenc),
410432
b_blob_id and b_blob_id.decode(defenc),
411433
a_mode and a_mode.decode(defenc),
412434
b_mode and b_mode.decode(defenc),
413435
new_file, deleted_file,
414-
rename_from and rename_from.decode(defenc, 'replace'),
415-
rename_to and rename_to.decode(defenc, 'replace'),
436+
rename_from,
437+
rename_to,
416438
None))
417439

418440
previous_header = header
@@ -438,8 +460,8 @@ def _index_from_raw_format(cls, repo, stream):
438460
meta, _, path = line[1:].partition('\t')
439461
old_mode, new_mode, a_blob_id, b_blob_id, change_type = meta.split(None, 4)
440462
path = path.strip()
441-
a_path = path
442-
b_path = path
463+
a_path = path.encode(defenc)
464+
b_path = path.encode(defenc)
443465
deleted_file = False
444466
new_file = False
445467
rename_from = None
@@ -455,6 +477,8 @@ def _index_from_raw_format(cls, repo, stream):
455477
new_file = True
456478
elif change_type[0] == 'R': # parses RXXX, where XXX is a confidence value
457479
a_path, b_path = path.split('\t', 1)
480+
a_path = a_path.encode(defenc)
481+
b_path = b_path.encode(defenc)
458482
rename_from, rename_to = a_path, b_path
459483
# END add/remove handling
460484

git/test/test_diff.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ def test_diff_with_rename(self):
9090
assert_true(diff.renamed)
9191
assert_equal(diff.rename_from, u'Jérôme')
9292
assert_equal(diff.rename_to, u'müller')
93+
assert_equal(diff.raw_rename_from, b'J\xc3\xa9r\xc3\xb4me')
94+
assert_equal(diff.raw_rename_to, b'm\xc3\xbcller')
9395
assert isinstance(str(diff), str)
9496

9597
output = StringProcessAdapter(fixture('diff_rename_raw'))
@@ -129,7 +131,7 @@ def test_diff_index_raw_format(self):
129131
output = StringProcessAdapter(fixture('diff_index_raw'))
130132
res = Diff._index_from_raw_format(None, output.stdout)
131133
assert res[0].deleted_file
132-
assert res[0].b_path == ''
134+
assert res[0].b_path is None
133135

134136
def test_diff_initial_commit(self):
135137
initial_commit = self.rorepo.commit('33ebe7acec14b25c5f84f35a664803fcab2f7781')
@@ -162,7 +164,9 @@ def test_diff_unsafe_paths(self):
162164
self.assertEqual(res[7].b_path, u'path/with-question-mark?')
163165
self.assertEqual(res[8].b_path, u'path/¯\\_(ツ)_|¯')
164166
self.assertEqual(res[9].b_path, u'path/💩.txt')
167+
self.assertEqual(res[9].b_rawpath, b'path/\xf0\x9f\x92\xa9.txt')
165168
self.assertEqual(res[10].b_path, u'path/�-invalid-unicode-path.txt')
169+
self.assertEqual(res[10].b_rawpath, b'path/\x80-invalid-unicode-path.txt')
166170

167171
# The "Moves"
168172
# NOTE: The path prefixes a/ and b/ here are legit! We're actually

0 commit comments

Comments
 (0)