"""Pattern-substitution compressor for comma-separated 8-bit binary strings.

The script reads bytes written as text (for example ``01101000,01101001``),
looks for byte sequences that repeat, stores each profitable sequence once in
an index table and replaces its occurrences with a 40-bit reference
(``11111111`` followed by a 32-bit pattern ID).

Output layout (written to disk as comma-separated 8-bit chunks):

    [00000000 <pattern>]...  0000000000000000  <compressed data>

The index table and its terminator are only present when at least one pattern
was stored.
"""
import binascii  # not used by the code below; kept from the original import list
import sys
import time
from collections import Counter

# Marks a pattern reference inside the compressed stream; a 32-bit ID follows.
PATTERN_MARKER = '11111111'
# Precedes every pattern stored in the index table.
INDEX_ENTRY_MARKER = '00000000'
# Two NULL characters separate the index table from the compressed data.
# Note: the pattern ID is 32 bits wide, so four NULL characters directly after
# a 11111111 marker are a pattern ID and do NOT terminate the index table.
INDEX_TERMINATOR = '0000000000000000'
PATTERN_ID_BITS = 32
MAX_PATTERN_ID = 4294967295  # largest value that fits in PATTERN_ID_BITS


def string2bits(string):
    """Return the UTF-8 bytes of *string* as a bit string, 8 bits per byte.

    Every byte is zero-padded to 8 bits so the result can be fed straight back
    into ``bits2string`` (which reads fixed 8-bit groups).
    """
    return ''.join(format(byte, '08b') for byte in string.encode())


def stl(lst, n):
    """Split a string (or any sliceable) into a list of pieces *n* long.

    The last piece is shorter when ``len(lst)`` is not a multiple of *n*.
    """
    return [lst[i:i + n] for i in range(0, len(lst), n)]


def lts(s):
    """Concatenate an iterable of strings into one string."""
    return ''.join(s)


def bits2string(b=None):
    """Decode a bit string into text, one character per 8-bit group.

    A trailing group shorter than 8 bits is decoded as-is.  ``None`` (the
    default) is treated as an empty bit string.

    Raises:
        ValueError: if a group contains characters that are not binary digits.
    """
    if b is None:
        return ''
    return ''.join(chr(int(group, 2)) for group in stl(b, 8))


def toBinary(n, bits):
    """Return the lowest *bits* bits of ``int(n)``, most significant bit first.

    The result is always exactly *bits* characters long (zero-padded, and
    truncated to the low bits when *n* is too large).
    """
    value = int(n)
    return ''.join(str((value >> shift) & 1) for shift in reversed(range(bits)))


def bytecount(quotes, data):
    """Print the size of *data* (an iterable of bit strings) after label *quotes*.

    The size is the total number of bits divided by 8, shown in bytes,
    kilobytes (1000 bytes) or megabytes (1000000 bytes).
    """
    size = len(lts(data)) / 8
    if size < 1000:
        print(quotes, size, "bytes")
    elif size < 1000000:
        print(quotes, size / 1000, "kilobytes")
    else:
        print(quotes, size / 1000000, "megabytes")


def compress(uncompressed):
    """Replace repeated byte sequences in *uncompressed* with index references.

    Args:
        uncompressed: list of bit strings, one per input byte.  It is not
            modified.

    Returns:
        A tuple ``(indexvars, compressed, overflowed)``:
        * ``indexvars`` - flat index table: ``'00000000'`` followed by the
          pattern bits, repeated once per stored pattern (pattern ID = position).
        * ``compressed`` - the data as 8-bit chunks with every stored pattern
          replaced by ``'11111111'`` + 32-bit pattern ID.
        * ``overflowed`` - True when scanning stopped because the 32-bit
          pattern ID space was exhausted.

    A progress line is printed for every pattern that gets stored.

    NOTE(review): counting and replacing are done on the concatenated bit
    string, so matches are not restricted to 8-bit boundaries, and an input
    byte that is literally ``11111111`` or ``00000000`` looks identical to a
    marker in the output.  A decoder has to cope with that; it is unchanged
    from the original behaviour.
    """
    total = len(uncompressed)
    bitstream = lts(uncompressed)           # loop-invariant, so joined once
    byte_counts = Counter(uncompressed)     # O(1) "is this byte repeated?" test
    reference_bits = len(PATTERN_MARKER) + PATTERN_ID_BITS

    compressed = list(uncompressed)
    indexvars = []
    matchstore = []
    index_id = 0
    maxscan = -1        # last index already covered by a scan; nothing yet
    overflowed = False

    for i in range(total):
        # Skip bytes inside the previous scan and bytes that occur only once.
        if i <= maxscan or byte_counts[uncompressed[i]] <= 1:
            continue

        # Grow the pattern one byte at a time while the longer candidate still
        # occurs more than once in the input.
        pattern = uncompressed[i]
        end = i
        while (end + 1 < total
               and bitstream.count(pattern + uncompressed[end + 1]) > 1):
            end += 1
            pattern += uncompressed[end]
        maxscan = end

        # We don't want 'egg ', ' egg', 'g eg' and 'gg e' to be separate
        # occurrences: skip anything already covered by the index table or
        # containing an already stored pattern.
        if pattern in lts(indexvars):
            continue
        if any(stored in pattern for stored in matchstore):
            continue

        # Only store the pattern when the bits saved by all its references
        # outweigh the cost of its index entry.
        instances = bitstream.count(pattern)
        saved = instances * (len(pattern) - reference_bits)
        if saved <= len(pattern) + len(INDEX_ENTRY_MARKER):
            continue

        matchstore.append(pattern)
        indexvars.append(INDEX_ENTRY_MARKER)
        indexvars.append(pattern)
        reference = PATTERN_MARKER + toBinary(index_id, PATTERN_ID_BITS)
        compressed = stl(lts(compressed).replace(pattern, reference), 8)
        index_id += 1
        print('[' + bits2string(pattern) + ']',
              '(' + str(instances), 'instances)',
              'progress:', str(int(i / total * 1000) / 10) + '% done')

        if index_id > MAX_PATTERN_ID:
            # The next pattern would need an ID wider than 32 bits.
            print("Pattern index overflow. More memory required.")
            overflowed = True
            break

    return indexvars, compressed, overflowed


def _read_input():
    """Prompt for the input and return it as one raw string.

    Typing ``file`` asks for a path instead and reads that file with newlines
    removed.  Exits the program when the file cannot be read.
    """
    raw = input("Binary input: ")  # BINARY MUST BE SEPARATED BY COMMA
    if raw != 'file':
        return raw
    try:
        with open(input("path to file: "), 'r') as file:
            raw = file.read().replace('\n', '')
    except (OSError, ValueError):
        # OSError: missing/unreadable path; ValueError: bad path or encoding.
        print("that wasn't a valid file.")
        time.sleep(2)
        sys.exit()
    print("file length:", len(raw))
    time.sleep(2)
    return raw


def _prepend_index(indexvars, compressed):
    """Terminate a non-empty index table and insert it in front of the data."""
    if indexvars:
        indexvars.append(INDEX_TERMINATOR)
    compressed[0:0] = indexvars


def saveExport():
    """Ask for an export folder until the output file can be opened.

    Reads the module-level ``filename`` and stores the opened file object in
    the module-level ``f`` (also returned for convenience).  The folder is
    concatenated with the file name as typed, so it has to end with a path
    separator.
    """
    global f
    while True:
        filepath = str(input("path to export folder (use '//' to separate): "))
        try:
            f = open(filepath + filename + ".txt", "w+")
        except (OSError, ValueError):
            print("mistakes have been made.")
        else:
            return f


if __name__ == "__main__":
    uncompressed = _read_input().split(',')
    print("Transferring infromation...")
    bytecount("uncompressed file size:", uncompressed)

    indexvars, compressed, overflowed = compress(uncompressed)

    if overflowed:
        # Report what was achieved so far and stop without saving.
        _prepend_index(indexvars, compressed)
        bytecount("uncompressed file size:", uncompressed)
        bytecount("compressed file size:", compressed)
        sys.exit()

    print(indexvars)
    print()
    print(compressed)
    print()
    print(bits2string(lts(compressed)))
    print()

    _prepend_index(indexvars, compressed)

    print()
    bytecount("uncompressed file size:", uncompressed)
    bytecount("compressed file size:", compressed)
    print()

    filename = str(input("name output file (leave blank to not save and don't include file extensions): "))
    if filename:
        saveExport()
        print("saving... (This could take a while.)")
        with f:  # closes the file even if the write fails
            f.write(','.join(stl(lts(compressed), 8)))
        print("done saving!")