-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhtml_diff.py
516 lines (431 loc) · 21.8 KB
/
html_diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
import re
import tqdm
import json
import sys
from difflib import SequenceMatcher
from subprocess import run,PIPE
import io
from io import StringIO
import argparse
# A single tag: "<" followed by the shortest run of characters that reaches a ">".
TAG_RE = re.compile(r'<.*?>')
# Regions that splitting_preferences.prevent_breakup() hands back as one chunk:
# HTML comments and complete <style>, <script> and <head> elements.
# No re.DOTALL flag is given, so "." does not cross a line break.
PROTECTED_RE = re.compile(r'<!--.*?-->|<style.*?>.*?</style>|<script.*?>.*?</script>|<head.*?>.*?</head>')
# An opening tag, content without any "<" or ">", then a closing tag.
# Not referenced anywhere in the code shown in this file.
SINGLE_RE = re.compile(r'<[^/<>]*>[^<>]*</[^/<>]*>')
# A whole string that begins with an opening tag and ends with a closing tag.
# Not referenced anywhere in the code shown in this file.
ENCLOSED_RE = re.compile(r'^<[^/<>]*>.*</[^/<>]*>$')
# Tokeniser applied (with findall) to the text between tags. Alternatives, in order:
#   1. a run of characters that are neither whitespace nor one of , . & ; / # = < > ( ) -
#   2. a run of whitespace
#   3. a single one of the punctuation characters listed above
# NOTE(review): alternative 2 ends with "|" followed by one blank-looking character;
# presumably a non-breaking space (or a decoded "&nbsp;") - confirm against the upstream file.
WORD_RE = re.compile(
    r'([^ \n\r\t,.&;/#=<>()-]+|(?:[ \n\r\t]| )+|[,.&;/#=<>()-])'
)
# With .match(): succeeds for any single-line string that contains at least one tag of any kind.
FRONT_TAG_RE = re.compile(r'.*<.*?>.*')
# NOTE(review): "/*?" is a lazily quantified slash, so this pattern only accepts "<>", "</>",
# "<//>", ... and never a real closing tag such as "</p>". Presumably r'.*</.*?>.*' was
# intended - confirm before changing it, because process_patch() uses the rules carried by
# get_context to decide when to stop widening a patch.
BACK_TAG_RE = re.compile(r'.*</*?>.*')
# Earlier, fully anchored variant, kept for reference.
#WS_RE = re.compile(r'^([ \n\r\t]| )+$')
# One or more whitespace characters. The pattern is not anchored at the end, so with
# .match() it succeeds as soon as the string *starts* with whitespace.
WS_RE = re.compile(r'([ \n\r\t]| )+')
# A unified-diff hunk header, "@@ ... @@". "[^@]*" also matches line breaks.
GIT_DIFF_LINE_GETTER = re.compile(r'@@[^@]*@@')
class get_context(Exception):
    """Signal that a text fragment needs more surrounding context.

    The exception carries two patterns for the handler: ``front_rule`` and
    ``back_rule``. In this file process_patch() catches it, moves the patch
    start backwards until ``front_rule`` matches and lengthens the patch
    until ``back_rule`` matches.
    """

    def __init__(self, message, front_rule, back_rule) -> None:
        super().__init__(message)
        self.back_rule = back_rule
        self.front_rule = front_rule
class log(object):
    """Minimal console logger with an optional tqdm progress bar.

    Creating an instance stores it in ``log.INSTANCE`` so other functions in
    this file can reach it without it being passed around. A message is
    printed when its level is less than or equal to the configured level.
    """
    LOG_LEVEL_QUIET = -1 #no console output
    LOG_LEVEL_MINIMAL = 1 #only requested output (will not alert for any errors)
    LOG_LEVEL_ERROR = 2 #show errors
    LOG_LEVEL_PROGRESS = 3 #progress bars and requested output(ie the resulting file)
    LOG_LEVEL_DEBUG = 4 #show all output (will be messy)
    INSTANCE = None

    def __init__(self,log_level = LOG_LEVEL_DEBUG) -> None:
        super().__init__()
        self.log_level = log_level
        self.progress_bar = None
        log.INSTANCE = self

    def log(self,message,level):
        """Print *message* if *level* does not exceed the configured log level."""
        if level <= self.log_level:
            print(message)

    def debug(self,messege):
        """Emit *messege* at debug level."""
        # the parameter keeps its original spelling so keyword callers still work
        self.log(messege, self.LOG_LEVEL_DEBUG)

    def error(self,messege):
        """Emit *messege* at error level."""
        self.log(messege, self.LOG_LEVEL_ERROR)

    def show(self,message):
        """Emit *message* at the minimal level (explicitly requested output)."""
        self.log(message, self.LOG_LEVEL_MINIMAL)

    def start_bar(self,name,total):
        """Create a progress bar labelled *name*; only done at progress level or above."""
        if self.log_level >= self.LOG_LEVEL_PROGRESS:
            self.progress_bar = tqdm.tqdm(desc=name,total=total)

    def add_work(self,addition):
        """Raise the bar's total by *addition* and redraw it."""
        # "is not None" instead of truthiness: a tqdm bar whose total is 0 is falsy
        if self.log_level >= self.LOG_LEVEL_PROGRESS and self.progress_bar is not None:
            self.progress_bar.total += addition
            self.progress_bar.update(0)

    def complete_work(self,work):
        """Advance the bar by *work* units."""
        if self.log_level >= self.LOG_LEVEL_PROGRESS and self.progress_bar is not None:
            self.progress_bar.update(work)

    def stop_bar(self):
        """Close and forget the current bar; a no-op when no bar was started."""
        if self.progress_bar is not None:
            self.progress_bar.close()
        self.progress_bar = None
# An opening or closing tag whose name starts with "li" (so <li>, </li>, but also e.g. <link>),
# provided nothing up to the ">" contains "<" or "/". Not referenced in the code shown in this file.
LIST_ITEM = re.compile(r'</?li[^</>]*>')
class splitting_preferences():
    """Rule set that steers how HTML is split and how changes are marked up.

    Every rule list holds compiled regular expressions, except ``sub_breaks``
    (pairs of ``(pattern, threshold)``) and ``escape_rules`` (triples of
    ``(rule, front, back)`` patterns). ``sub_rules`` is either ``None`` or
    another splitting_preferences that html_differ uses when it re-diffs a
    replaced section.

    Each omitted rule list becomes a fresh empty list, so instances created
    with defaults never share state.
    """

    def __init__(self,preference_breaks=None, sub_breaks=None, modify_inside=None, tag_as_text=None, kept_tags=None, escape_rules=None, no_diff=None, sub_rules=None):
        self.sub_rules = sub_rules
        self.kept_tags_RE = [] if kept_tags is None else kept_tags
        self.pref_breaks_RE = [] if preference_breaks is None else preference_breaks
        self.sub_breaks_RE = [] if sub_breaks is None else sub_breaks
        self.text_tags_RE = [] if tag_as_text is None else tag_as_text
        self.escape_rules = [] if escape_rules is None else escape_rules
        self.modify_inside_RE = [] if modify_inside is None else modify_inside
        self.no_diff_RE = [] if no_diff is None else no_diff

    @staticmethod
    def _matches_any(rules, text):
        """Return True when at least one pattern in *rules* matches the start of *text*."""
        return any(rule.match(text) for rule in rules)

    def get_subrules(self):
        """Return the nested preferences used for second-level diffs (may be None)."""
        return self.sub_rules

    #breaking rules
    def prevent_breakup(self,html,pos):
        """Return a match for a protected section starting at *pos*, or None.

        Protected sections (comments, <script>, <style>, <head>) cannot be
        split like normal html, so the splitter returns them whole.
        """
        return PROTECTED_RE.match(html,pos=pos)

    def preference_breaks(self,html,pos):
        """Return the first user-defined break pattern matching at *pos*, or None.

        The splitter tries this after whitespace and protected sections but
        before the general tag/text splitting.
        """
        for rule in self.pref_breaks_RE:
            if(match := rule.match(html,pos)):
                return match
        return None

    def sub_breaks(self,html):
        """Return the threshold of the first sub-break rule matching *html*, else False."""
        for rule,threshold in self.sub_breaks_RE:
            if rule.match(html):
                return threshold
        return False

    def modify_inside(self,html):
        """Return True when *html* matches one of the modify-inside patterns."""
        return self._matches_any(self.modify_inside_RE, html)

    def treat_tag_as_text(self,item):
        """Return True when tag *item* should be handled like plain text."""
        return self._matches_any(self.text_tags_RE, item)

    def keep_tag_delete(self,item):
        """Return True when tag *item* must be kept in the output of a deletion."""
        return self._matches_any(self.kept_tags_RE, item)

    def require_escape(self,text):
        """Check that *text* carries enough context; raise get_context if not.

        Raises:
            get_context: when *text* contains no tag at all, or when an escape
                rule matches *text* while its front or its back pattern does
                not. The exception carries the patterns to widen with.

        Returns False when no further context is required.
        """
        #require at least one tag in the text
        if(not BACK_TAG_RE.match(text) and not FRONT_TAG_RE.match(text)):
            # NOTE(review): get_context takes (message, front_rule, back_rule), so
            # BACK_TAG_RE travels as the front rule here - confirm this order is intended.
            raise get_context("required context",BACK_TAG_RE,FRONT_TAG_RE)
        for (rule,front,back) in self.escape_rules:
            if rule.match(text) and not (front.match(text) and back.match(text)):
                raise get_context("required conext",front,back)
        return False

    def require_escape_no_raise(self,text):
        """Return ``(front, back)`` for the first escape rule lacking both ends.

        Despite the name, a *text* without any tag still raises get_context.
        Unlike require_escape(), a rule only triggers when *neither* its front
        nor its back pattern matches. Returns None when no rule triggers.
        """
        #require at least one tag in the text
        if(not BACK_TAG_RE.match(text) and not FRONT_TAG_RE.match(text)):
            raise get_context("required context",BACK_TAG_RE,FRONT_TAG_RE)
        for (rule,front,back) in self.escape_rules:
            if(rule.match(text) and not front.match(text) and not back.match(text)):
                return (front,back)
        return None

    def no_diff(self,text):
        """Return True when *text* matches a pattern that must be copied without diffing."""
        return self._matches_any(self.no_diff_RE, text)
class html_splitter(object):
    """Iterator that cuts an HTML string into chunks.

    Each step first skips whitespace, then yields (in this order of
    precedence) a protected section, a user-preferred section, a single tag,
    or the text that runs up to the next tag. The final chunk is whatever is
    left once no tag remains (possibly an empty string).
    """

    def __init__(self, html_string, spliting_preferences = splitting_preferences()):
        self.html_string = html_string
        self.pos = 0
        self.end_reached = False
        self.splitting_preferences = spliting_preferences

    def __iter__(self):
        return self

    def __next__(self):
        if self.end_reached:
            raise StopIteration
        source = self.html_string
        rules = self.splitting_preferences

        # swallow every run of whitespace in front of the next chunk
        while True:
            blank = WS_RE.match(source, pos=self.pos)
            if not blank:
                break
            self.pos = blank.end()

        # protected section > user-defined section > plain tag
        hit = (
            rules.prevent_breakup(source, self.pos)
            or rules.preference_breaks(source, self.pos)
            or TAG_RE.match(source, pos=self.pos)
        )
        if hit:
            self.pos = hit.end()
            return hit.group(0)

        # plain text: runs up to the next tag, or to the end of the input
        upcoming = TAG_RE.search(source, pos=self.pos)
        if upcoming:
            chunk = source[self.pos:upcoming.start()]
            self.pos = upcoming.start()
            return chunk
        self.end_reached = True
        return source[self.pos:]

    def next(self):
        return self.__next__()
class html_differ(SequenceMatcher):
    """SequenceMatcher over HTML tokens that renders changes as <ins>/<del> markup.

    Both inputs are cut into chunks by html_splitter; chunks that start with
    "<" stay whole and all other chunks are cut into words with WORD_RE.
    diff_html() then walks the opcodes and wraps removed text in <del> and
    added text in <ins>.
    """
    start_del = "<del>"
    stop_del = "</del>"
    start_ins = "<ins>"
    stop_ins = "</ins>"

    def __init__(self, source1,source2,splitting_preferences = splitting_preferences()) -> None:
        # must be assigned first: SequenceMatcher.__init__ calls set_seqs(), which reads it
        self.splitting_preferences = splitting_preferences
        SequenceMatcher.__init__(self, lambda x: x in [""," ","\t","\n"], source1, source2, False)

    @staticmethod
    def _tokens(chunks):
        """Flatten splitter chunks into one token list (tags whole, text cut into words)."""
        tokens = []
        for chunk in chunks:
            if chunk.startswith('<'):
                tokens.append(chunk)
            else:
                tokens.extend(WORD_RE.findall(chunk))
        return tokens

    @staticmethod
    def _only_whitespace(items):
        """Return True when every item starts with whitespace (also True for no items)."""
        return all(WS_RE.match(item) for item in items)

    def set_seqs(self,a,b,protect_small_tags_a = None, protect_small_tags_b = None):
        """Tokenise both HTML strings and install them as the sequences to compare.

        The two ``protect_small_tags_*`` parameters are accepted but not used.
        """
        #split text by tags and and non-tags by words
        SequenceMatcher.set_seqs(self,
            self._tokens(html_splitter(a,self.splitting_preferences)),
            self._tokens(html_splitter(b,self.splitting_preferences))
        )

    def clean_delete(self,del_items):
        """Return markup for removed tokens: text goes inside <del>, tags are dropped unless kept."""
        prefs = self.splitting_preferences
        parts = []
        text = []
        for item in del_items:
            if prefs.modify_inside(item):
                # re-split the element (default splitter preferences) and mark up its inside
                # NOTE(review): text collected so far is not flushed first, so it is emitted
                # after this element - confirm whether that ordering is intended.
                parts.append(self.clean_delete(self._tokens(html_splitter(item))))
            elif item.startswith('<') and not prefs.treat_tag_as_text(item):
                if not self._only_whitespace(text): #not all white space
                    parts.append(html_differ.start_del+"".join(text)+html_differ.stop_del)
                # NOTE(review): reset assumed to be unconditional (whitespace-only text is dropped) - confirm
                text = []
                if prefs.keep_tag_delete(item):
                    parts.append(item)
            else:
                text.append(item)
        if text:
            parts.append(html_differ.start_del+"".join(text)+html_differ.stop_del)
        return "".join(parts)

    def clean_insert(self,ins_items):
        """Return markup for added tokens: text goes inside <ins>, tags are kept as they are."""
        prefs = self.splitting_preferences
        parts = []
        text = []
        for item in ins_items:
            if prefs.modify_inside(item):
                # NOTE(review): same ordering caveat as in clean_delete - pending text is not flushed first
                parts.append(self.clean_insert(self._tokens(html_splitter(item))))
            elif item.startswith('<') and not prefs.treat_tag_as_text(item):
                if not self._only_whitespace(text): #not all white space
                    parts.append(html_differ.start_ins+"".join(text)+html_differ.stop_ins)
                # NOTE(review): reset assumed to be unconditional (whitespace-only text is dropped) - confirm
                text = []
                parts.append(item) #keep tags
            else:
                text.append(item)
        if text and not self._only_whitespace(text):
            parts.append(html_differ.start_ins+"".join(text)+html_differ.stop_ins)
        return "".join(parts)

    def white_space_change(self,origin,modified):
        """Return True when *origin* and *modified* differ only in whitespace or plain tags.

        Requires the same number of tokens on both sides; each pair must be
        two whitespace tokens, two tags (neither treated as text nor marked
        modify-inside), or two identical tokens.
        """
        if len(origin) != len(modified):
            return False
        prefs = self.splitting_preferences
        for o,m in zip(origin,modified):
            if WS_RE.match(o) and WS_RE.match(m): #both are whitespace
                continue
            if(o.startswith('<') and m.startswith('<')
                    and not prefs.treat_tag_as_text(m) and not prefs.treat_tag_as_text(o)
                    and not prefs.modify_inside(m) and not prefs.modify_inside(o)):
                continue
            if o != m:
                return False
        return True

    def detect_sub_breaks(self,items):
        """Return True when any item matches a sub-break rule with a truthy threshold."""
        return any(self.splitting_preferences.sub_breaks(item) for item in items)

    def _render_replace(self,old_items,new_items):
        """Return the markup for one 'replace' opcode."""
        prefs = self.splitting_preferences
        sub_rules = prefs.get_subrules()
        if sub_rules is not None and (self.detect_sub_breaks(old_items) or self.detect_sub_breaks(new_items)):
            # diff the section again with the nested rules; use that result only when
            # the two sides are similar enough according to the matching thresholds
            old_text = "".join(old_items)
            new_text = "".join(new_items)
            d = html_differ(old_text,new_text,sub_rules)
            ratio = d.ratio()
            if ratio > prefs.sub_breaks(old_text) or ratio > prefs.sub_breaks(new_text):
                return d.diff_html()
        if not self.white_space_change(old_items,new_items):
            return self.clean_delete(old_items) + self.clean_insert(new_items)
        # whitespace/tag-only change: emit the new side without a <del> part
        return self.clean_insert(new_items)

    def diff_html(self):
        """use the sequence matcher to create the diffed html"""
        prefs = self.splitting_preferences
        a = self.a
        b = self.b
        out = StringIO()
        for tag, start_a, end_a, start_b, end_b in self.get_opcodes(): #main loop
            old_items = a[start_a:end_a]
            new_items = b[start_b:end_b]
            new_text = "".join(new_items)
            if prefs.no_diff(new_text):
                # sections flagged as no-diff are copied from the new side unchanged
                out.write(new_text)
                continue
            if tag == 'equal':
                out.write("".join(old_items))
            elif tag == 'delete':
                out.write(self.clean_delete(old_items))
            elif tag == 'insert':
                out.write(self.clean_insert(new_items))
            elif tag == 'replace':
                out.write(self._render_replace(old_items,new_items))
        html = out.getvalue()
        out.close()
        return html
def git_diff(commit_a,commit_b,file_path,context = 1):
    """function to grab changes and line numbers

    Runs ``git diff --minimal --unified=<context>`` for *file_path* between
    the two commits and returns one dict per hunk with the keys ``start_a``,
    ``length_a``, ``start_b`` and ``length_b``. The start values are copied
    verbatim from the hunk header.
    """
    out = run(['git','--no-pager','diff','--minimal',f'--unified={context}',commit_a,commit_b,'--',file_path],stdout=PIPE).stdout.decode("utf-8")

    def parse_range(token):
        # token looks like "-12,3" or "+12,3"; git leaves out ",count" when the count is 1
        start, _, length = token.lstrip("-+").partition(",")
        return int(start), (int(length) if length else 1)

    def clean_line_numbers(line_numbers):
        line_pairs = line_numbers.replace('@@','').strip().split()
        start_a, length_a = parse_range(line_pairs[0])
        start_b, length_b = parse_range(line_pairs[1])
        return {
            "start_a":start_a,
            "length_a":length_a,
            "start_b":start_b,
            "length_b":length_b,
        }

    # only lines that begin with "@@" are hunk headers; content lines always start
    # with " ", "+", "-" or "\", so "@" characters inside the file cannot be mistaken
    patches = []
    for line in out.split("\n"):
        if not line.startswith('@@'):
            continue
        header = GIT_DIFF_LINE_GETTER.match(line)
        if header:
            patches.append(clean_line_numbers(header.group(0)))
    return patches
def git_read_file(commit_a,file_path):
    """Return *file_path* as stored in *commit_a*, split into lines.

    The content comes from ``git show <commit>:<path>``, is decoded as UTF-8
    and split on "\\n"; the returned lines carry no line terminator.
    """
    completed = run(['git','--no-pager','show',f"{commit_a}:{file_path}"],stdout=PIPE)
    content = completed.stdout.decode("utf-8")
    return content.split("\n")
def _patch_text(lines, patch, side):
    """Join the lines of *lines* that *patch* covers on *side* ("a" or "b")."""
    start = patch[f"start_{side}"]
    return "".join(lines[start:start + patch[f"length_{side}"]])


def process_patch(file_a,file_b,patch,spliting_preferences,min_start_a=0,min_start_b=0):
    """Diff the region described by *patch*, widening it until it has enough context.

    *patch* is modified in place while it is widened. The start is never
    moved to or below *min_start_a* / *min_start_b*, and the region never
    grows past the end of either file. When no further widening is possible
    the region is diffed as it is instead of retrying forever.

    Returns ``{"new_text": <diffed html>, "patch": <the final patch>}``.
    """
    while True:
        text_a = _patch_text(file_a, patch, "a")
        text_b = _patch_text(file_b, patch, "b")
        #the context is only checked while the patch can still move backwards
        if not (min_start_a < patch["start_a"] and min_start_b < patch["start_b"]):
            break
        try:
            spliting_preferences.require_escape(text_a)
            spliting_preferences.require_escape(text_b)
            break
        except get_context as con_req:
            before = dict(patch)
            # step backwards, both sides together, until the front rule matches on both
            while ( not con_req.front_rule.match(_patch_text(file_b, patch, "b"))
                    or not con_req.front_rule.match(_patch_text(file_a, patch, "a")) ) \
                    and (patch["start_a"]>min_start_a and patch["start_b"]>min_start_b):
                patch["start_a"] -= 1
                patch["length_a"] += 1
                patch["start_b"] -= 1
                patch["length_b"] += 1
            # extend forwards on each side until the back rule matches or the file ends
            while patch["start_a"]+patch["length_a"] < len(file_a) \
                    and not con_req.back_rule.match(_patch_text(file_a, patch, "a")):
                patch["length_a"] += 1
            while patch["start_b"]+patch["length_b"] < len(file_b) \
                    and not con_req.back_rule.match(_patch_text(file_b, patch, "b")):
                patch["length_b"] += 1
            if patch == before:
                # nothing left to widen: checking again would raise the same request forever
                break
    diff = html_differ(text_a,text_b,splitting_preferences=spliting_preferences)
    end = diff.diff_html()
    return {"new_text":end,"patch":patch}
def process_file(patch_list,file_a,file_b,split_pref):
    """Build the diffed version of *file_a* from a list of patches.

    Every patch is diffed with process_patch(); the resulting html replaces
    the first line of the patched region in *file_a* and the remaining lines
    of that region become empty strings. Patches that end before the region
    already handled are skipped, and patches that overlap it are trimmed at
    the front first. Returns the new list of lines.
    """
    replacements = {}
    end_a = 0
    end_b = 0
    logger = log.INSTANCE
    logger.start_bar("computing patches",len(patch_list))
    for patch in patch_list:
        # patch lies entirely inside what was already processed
        if(patch["start_a"] + patch["length_a"] < end_a or patch["start_b"] + patch["length_b"] < end_b):
            logger.complete_work(1)
            continue
        # cut off the part that overlaps the previous result
        while patch["start_a"] < end_a and patch["start_b"] < end_b:
            patch["start_a"] += 1
            patch["length_a"] -= 1
            patch["start_b"] += 1
            patch["length_b"] -= 1
        if(patch["length_b"]<=0 or patch["length_a"]<=0):
            logger.complete_work(1)
            continue
        result = process_patch(file_a,file_b,patch,split_pref,min_start_a=end_a,min_start_b=end_b)
        logger.complete_work(1)
        done = result["patch"]
        first_line = done["start_a"]
        for line_number in range(first_line+1,first_line+done["length_a"]):
            replacements[line_number] = ""
        replacements[first_line] = result["new_text"]
        end_a = first_line+done["length_a"]
        end_b = done["start_b"]+done["length_b"]
    logger.stop_bar()
    # lines without a replacement are passed through unchanged
    return [replacements.get(line_number,line) for line_number,line in enumerate(file_a)]
def preference_breaks_from_json(json_dict):
    """Compile the ``break_tags`` of *json_dict* and of every nested ``sub_rules`` level.

    Each tag name becomes a pattern for the whole element, opening tag through
    closing tag. The outer level's patterns come first.
    """
    patterns = []
    level = json_dict
    while level != {}:
        for tag in level.get("break_tags",[]):
            patterns.append(re.compile(f'<{tag}[^/<>]*>.*?</{tag}[^/<>]*>'))
        level = level.get("sub_rules",{})
    return patterns
def preference_sub_breaks_from_json(json_dict):
    """Compile ``sub_break_tags`` into ``(pattern, value)`` pairs.

    ``sub_break_tags`` maps a tag name to a value; the value is passed through
    unchanged next to the pattern for the whole element. Pairs from nested
    ``sub_rules`` levels follow those of the outer level.
    """
    if json_dict == {}:
        return []
    own_rules = [
        (re.compile(f'<{key}[^/<>]*>.*?</{key}[^/<>]*>') , value)
        for key,value in json_dict.get("sub_break_tags",{}).items()
    ]
    return own_rules + preference_sub_breaks_from_json(json_dict.get("sub_rules",{}))
def preference_sub_modify_inside_json(json_dict):
    """Compile the ``modify_inside`` tag names of every rule level into element patterns."""
    if json_dict == {}:
        return []
    compiled = []
    for tag in json_dict.get("modify_inside",[]):
        compiled.append(re.compile(f'<{tag}[^/<>]*>.*?</{tag}[^/<>]*>'))
    # patterns of the nested levels go after those of this level
    compiled.extend(preference_sub_modify_inside_json(json_dict.get("sub_rules",{})))
    return compiled
def preference_text_tags_from_json(json_dict):
    """Compile the ``text_tags`` tag names of every rule level into element patterns."""
    patterns = []
    level = json_dict
    # walk down the chain of nested "sub_rules" dicts, outermost level first
    while level != {}:
        patterns.extend(
            re.compile(f'<{tag}[^/<>]*>.*?</{tag}[^/<>]*>')
            for tag in level.get("text_tags",[])
        )
        level = level.get("sub_rules",{})
    return patterns
def preference_kept_tags_from_json(json_dict):
    """Compile the ``kept_tags`` tag names of every rule level.

    Unlike the element patterns of the sibling builders, each pattern here
    matches a single opening or closing tag.
    """
    if json_dict == {}:
        return []
    tag_patterns = []
    for tag in json_dict.get("kept_tags",[]):
        tag_patterns.append(re.compile(f'</?{tag}[^/<>]*>'))
    nested = preference_kept_tags_from_json(json_dict.get("sub_rules",{}))
    return tag_patterns + nested
def preference_no_diff_from_json(json_dict):
    """Compile the ``no_diff`` tag names of every rule level into element patterns."""
    collected = []
    current = json_dict
    while current != {}:
        for tag in current.get("no_diff",[]):
            collected.append(re.compile(f'<{tag}[^/<>]*>.*?</{tag}[^/<>]*>'))
        # continue with the nested rule level, if there is one
        current = current.get("sub_rules",{})
    return collected
def preference_escape_rules(json_dict):
    """Compile ``context_escapes`` into ``(rule, front, back)`` pattern triples.

    ``context_escapes`` maps an enclosing tag name to a list of inner tag
    names. For each inner tag one triple is produced: ``rule`` finds the
    inner tag (opening or closing), ``front`` finds the enclosing opening
    tag and ``back`` the enclosing closing tag. Triples from nested
    ``sub_rules`` levels follow those of the outer level.
    """
    triples = []
    level = json_dict
    while level != {}:
        for key,values in level.get("context_escapes",{}).items():
            front = re.compile(f"^.*<{key}[^<>/]*?>.*")
            back = re.compile(f".*</{key}[^<>/]*?>.*$")
            triples.extend(
                (re.compile(f".*</?{v}[^<>/]*>.*"),front,back) for v in values
            )
        level = level.get("sub_rules",{})
    return triples
def preference_from_json(json_dict):
    """Build a splitting_preferences object from a parsed JSON rule dict.

    Each rule list is produced by its matching builder function. When
    *json_dict* has a ``sub_rules`` key the nested dict is converted the same
    way and attached as ``sub_rules``; otherwise an empty
    splitting_preferences is attached.
    """
    settings = {
        "preference_breaks": preference_breaks_from_json(json_dict),
        "sub_breaks": preference_sub_breaks_from_json(json_dict),
        "modify_inside": preference_sub_modify_inside_json(json_dict),
        "tag_as_text": preference_text_tags_from_json(json_dict),
        "kept_tags": preference_kept_tags_from_json(json_dict),
        "escape_rules": preference_escape_rules(json_dict),
        "no_diff": preference_no_diff_from_json(json_dict),
    }
    if "sub_rules" in json_dict.keys():
        settings["sub_rules"] = preference_from_json(json_dict["sub_rules"])
    else:
        settings["sub_rules"] = splitting_preferences()
    return splitting_preferences(**settings)
if __name__ == "__main__":
    # Command line entry point: diff one html file between two commits and
    # write the marked-up html to the output file (or to the console).
    parser = argparse.ArgumentParser(description = "Produces difference documents for html files in a git repository" )
    parser.add_argument('from_commit',type=str,nargs=1,help='the pervious commit of the file')
    parser.add_argument('to_commit',type=str,nargs=1,help='the newer commit of the file')
    parser.add_argument('file',type = str,nargs=1,help='the path to the html file to diff')
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout,help="optional location of output file for the diffed html defaults to the console")
    parser.add_argument('--pref',nargs='?', type=argparse.FileType('r'),help = "path to preference file")
    parser.add_argument('--log',type=int,nargs='?',default=4,help="set the level of log output")
    args = parser.parse_args()
    log(args.log) #registers itself as log.INSTANCE
    commit_a = args.from_commit[0]#"71ee350eb8806aa27c63829ef2141259c3c9538a"
    commit_b = args.to_commit[0]#"3dbec14a113f0be7cb9db471768d2751d3bb9dca"
    file_name = args.file[0]#"html/index.html"
    if(args.pref):
        split_pref=preference_from_json(json.load(args.pref))
    else:
        # without a preference file use an empty rule set; this must be a
        # splitting_preferences object, not the plain list that
        # preference_breaks_from_json({}) returns
        split_pref=preference_from_json({})
    line_nums = git_diff(commit_a,commit_b,file_name)
    file_a = git_read_file(commit_a,file_name)
    file_b = git_read_file(commit_b,file_name)
    result = process_file(line_nums,file_a,file_b,split_pref)
    args.outfile.reconfigure(encoding = 'utf-8')
    # NOTE(review): lines are written without a separator although git_read_file()
    # removed the line breaks - confirm that joining the lines is intended.
    for line in result:
        args.outfile.write(line)