2018-02-25 16:20:58 +01:00
|
|
|
#!/usr/bin/env PYTHONUNBUFFERED=1 python2
|
2014-12-01 23:39:42 +01:00
|
|
|
#
|
|
|
|
# Created by Bjarni R. Einarsson, placed in the public domain. Go wild!
|
|
|
|
#
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
|
|
|
|
# Resolve the input and output paths from the command line.
#
# Explicit checks are used rather than `assert`: assertions are removed when
# Python runs with -O, which would let a missing input or an already existing
# output file slip through and be overwritten.
dirtydb_input = None
dirtydb_output = None
if len(sys.argv) > 1:
    dirtydb_input = sys.argv[1]
    dirtydb_output = '%s.new' % dirtydb_input

if (dirtydb_input is None
        or not os.path.exists(dirtydb_input)
        or os.path.exists(dirtydb_output)):
    # print('') emits just a newline, like the bare `print` statement did.
    print('')
    print('Usage: %s /path/to/dirty.db' % sys.argv[0])
    print('')
    print('Note: Will create a file named dirty.db.new in the same folder,')
    print(' please make sure permissions are OK and a file by that')
    print(' name does not exist already. This script works by omitting')
    print(' duplicate lines from the dirty.db file, keeping only the')
    print(' last (latest) instance. No revision data should be lost,')
    print(' but be careful, make backups. If it breaks you get to keep')
    print(' both pieces!')
    print('')
    sys.exit(1)
|
|
|
|
|
|
|
|
# Map every record key to the raw text of the most recent line carrying it.
# Later lines overwrite earlier ones, so only the latest instance of each key
# survives.
dirtydb = {}
lines = 0
with open(dirtydb_input, 'r') as fd:
    print('Reading %s' % dirtydb_input)
    for lines, line in enumerate(fd, 1):
        try:
            data = json.loads(line)
            dirtydb[data['key']] = line
        except (ValueError, KeyError, TypeError):
            # ValueError: the line is not valid JSON.
            # KeyError:   the JSON object has no 'key' member.
            # TypeError:  the JSON value is not an object (or the key is
            #             unhashable).
            # Deliberately narrower than a bare `except:` so that Ctrl-C
            # (KeyboardInterrupt) still aborts the run instead of being
            # reported as a bad line.
            print("Skipping invalid JSON!")
        if lines % 10000 == 0:
            # Progress marker on stderr, one dot per 10000 input lines.
            sys.stderr.write('.')
print('')
print('OK, found %d unique keys in %d lines' % (len(dirtydb), lines))
|
|
|
|
|
|
|
|
# Write one line per surviving key to the output file.
with open(dirtydb_output, 'w') as fd:
    for data in dirtydb.values():
        # The last line of the input may lack a trailing newline. Since the
        # dict does not necessarily yield that record last, writing it
        # verbatim could glue it to the following record; terminate it here.
        if not data.endswith('\n'):
            data += '\n'
        fd.write(data)

print('Wrote data to %s. All done!' % dirtydb_output)
|