|
@@ -0,0 +1,288 @@
|
|
|
+
|
|
|
+from flask import Flask,request,jsonify,render_template,send_file,Response,redirect,url_for
|
|
|
+import base64
|
|
|
+from flask_bootstrap import Bootstrap
|
|
|
+import sqlite3
|
|
|
+import re
|
|
|
+from difflib import SequenceMatcher
|
|
|
+import PyPDF2
|
|
|
+import difflib
|
|
|
+import fitz
|
|
|
+from zipfile import ZipFile
|
|
|
+
|
|
|
+
|
|
|
# File extensions that allowed_file() accepts for uploads.
ALLOWED_EXTENSIONS = {'pdf'}

# Flask application object with Flask-Bootstrap registered on it.
app = Flask(__name__)
Bootstrap(app)

# NOTE(review): the secret key is hard-coded in source; consider loading it
# from configuration or the environment before deploying.
app.secret_key = '12345'

# The former bare ``app.run`` expression was removed: without parentheses it
# only looked the attribute up and never started a server.
|
|
|
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last ``.`` and is compared
    case-insensitively; a name without any ``.`` is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS
|
|
|
+
|
|
|
@app.route('/uploadajax', methods=['GET', 'POST'])
def upload_file():
    """Store, list, or delete PDF pairs in the ``files`` table of ``baaangt.db``.

    * ``GET`` returns the ``(UUID, original_pdf_name, reference_pdf_name)``
      rows as JSON, or ``''`` as JSON when the query fails.
    * ``POST`` with uploaded files reads the parts ``original`` and
      ``reference`` plus the form field ``UUID``, stores both PDFs
      base64-encoded, and echoes the submitted names back as JSON. A
      duplicate UUID, a non-PDF name, an empty UUID, or any other failure
      yields an HTTP 400 response.
    * ``POST`` without files reads ``uuid`` from the JSON body, deletes that
      row, and renders ``index.html``.

    NOTE(review): the branch nesting was reconstructed from a listing whose
    indentation had been lost - confirm it against the original file.
    """
    sqliteConnection = sqlite3.connect('baaangt.db')
    try:
        cursor = sqliteConnection.cursor()
        print("Connected to SQLite")
        cursor.execute('''CREATE TABLE IF NOT EXISTS files (
            UUID integer PRIMARY KEY,
            original_pdf_name NOT NULL,
            original_pdf text NOT NULL,
            reference_pdf_name NOT NULL,
            reference_pdf text NOT NULL
        );''')

        if request.method != 'POST':
            # Listing is best-effort: any failure degrades to an empty payload.
            try:
                cursor.execute('Select UUID, original_pdf_name,reference_pdf_name from files')
                return jsonify(cursor.fetchall())
            except Exception:
                return jsonify('')

        if not request.files:
            # A POST without files is treated as a delete request.
            uuid_value = int(request.json['uuid'])
            print(type(uuid_value))
            # Placeholder binding instead of string formatting.
            cursor.execute('Delete from files where UUID = ?', (uuid_value,))
            sqliteConnection.commit()
            print(request.json['uuid'])
            return render_template('index.html')

        file_orig = request.files['original']
        file_ref = request.files['reference']
        uuid = request.form['UUID']
        files_json = [{'file_orig': file_orig.filename,
                       'file_ref': file_ref.filename,
                       'uuid': uuid}]
        print(files_json)
        print(allowed_file(file_orig.filename))
        try:
            if not (allowed_file(file_orig.filename)
                    and allowed_file(file_ref.filename)
                    and uuid):
                return Response("All fields must be selected", status=400,
                                mimetype='application/json')

            # A non-numeric UUID raises here and is reported as an upload error.
            uuid_value = int(uuid)
            blob_orig = base64.b64encode(file_orig.read())
            blob_ref = base64.b64encode(file_ref.read())

            cursor.execute('select uuid from files where uuid = ?', (uuid_value,))
            if cursor.fetchone() is not None:
                return Response("UUID Already Exists", status=400,
                                mimetype='application/json')

            cursor.execute(
                """ INSERT INTO files
                (UUID,original_pdf_name, original_pdf,reference_pdf_name,reference_pdf)
                VALUES (?, ?, ?,?,?)""",
                (uuid_value, file_orig.filename, blob_orig,
                 file_ref.filename, blob_ref),
            )
            sqliteConnection.commit()
            return jsonify(files_json)
        except Exception:
            return Response("Error in uploading", status=400,
                            mimetype='application/json')
    finally:
        # Runs on every return path, so the connection is never leaked.
        sqliteConnection.close()
|
|
|
@app.route('/')
def index():
    """Serve the page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
|
|
|
+
|
|
|
def _strip_page_markers(text, marker='Seite', token_count=4):
    """Remove every *marker* phrase from *text* and return the result.

    A phrase is the whitespace-separated token starting at *marker* plus the
    following tokens, *token_count* tokens in total (presumably page footers
    such as "Seite 1 von 5" - verify against real input).
    """
    while True:
        start = text.find(marker)
        if start == -1:
            return text
        phrase = ' '.join(text[start:].split()[:token_count])
        stripped = text.replace(phrase, '')
        if stripped == text:
            # The tokens were separated by something other than single spaces,
            # so the rebuilt phrase does not occur verbatim. Drop the marker
            # itself to guarantee progress instead of looping forever.
            stripped = text[:start] + text[start + len(marker):]
        text = stripped


def _read_pdf_text(pdf_path):
    """Return ``(normalised_text, page_texts)`` extracted from *pdf_path*.

    ``page_texts`` holds the raw text of each page; ``normalised_text`` is
    their concatenation with newlines turned into spaces, runs of spaces
    collapsed, and page-marker phrases removed.
    """
    # Extraction happens inside the ``with`` because the reader pulls page
    # data from the open file object.
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        page_texts = [reader.getPage(number).extractText()
                      for number in range(reader.numPages)]
    full_text = ''.join(page_texts).replace('\n', ' ')
    full_text = re.sub(' +', ' ', full_text)
    return _strip_page_markers(full_text), page_texts


def _changed_words(old_text, new_text):
    """Return the words found only in *old_text* and only in *new_text*.

    Both results are space-joined strings built from a word-level
    ``difflib.Differ`` comparison.
    """
    removed, added = [], []
    for entry in difflib.Differ().compare(old_text.split(), new_text.split()):
        if entry[0] == '-':
            removed.append(entry[2:])
        elif entry[0] == '+':
            added.append(entry[2:])
    return ' '.join(removed), ' '.join(added)


def _blank_shared_runs(source, other, min_length=6):
    """Replace character runs shared by both texts with a single space.

    Only runs longer than *min_length* characters are blanked, and only their
    first occurrence in each text. Returns the updated ``(source, other)``.
    """
    source_out, other_out = source, other
    for match in SequenceMatcher(None, source, other).get_matching_blocks():
        run = source[match.a:match.a + match.size]
        if len(run) > min_length:
            source_out = source_out.replace(run, ' ', 1)
            other_out = other_out.replace(run, ' ', 1)
    return source_out, other_out


def _highlight_words(source_pdf, target_pdf, words, page_texts):
    """Highlight *words* in *source_pdf* and save the result as *target_pdf*.

    NOTE(review): the loop nesting was reconstructed from a listing whose
    indentation had been lost. This version assumes each word is searched
    from the page of the previous hit onwards and only its first occurrence
    on the matching page is highlighted - confirm against the original file.
    """
    document = fitz.open(source_pdf)
    try:
        current_page = 0
        for word in words:
            for number in range(current_page, len(page_texts)):
                if word not in page_texts[number]:
                    continue
                page = document[number]
                hits = page.searchFor(word)
                if hits:
                    page.addHighlightAnnot(hits[0])
                # Advance the cursor only on a hit, so a word that is found
                # nowhere cannot push it past the last page and suppress
                # every later highlight.
                current_page = number
                break

        # Guarded with ``words and`` so an empty diff no longer raises
        # IndexError. A leading numeric token is highlighted on every page.
        if words and words[0].isdigit():
            for number in range(len(page_texts)):
                page = document[number]
                hits = page.searchFor(words[0])
                if hits:
                    page.addHighlightAnnot(hits[0])

        document.save(target_pdf, garbage=4, deflate=True, clean=True)
    finally:
        document.close()


@app.route('/comparison/<uuid>', methods=['GET'])
def comparison_(uuid):
    """Compare the stored PDF pair for *uuid* and return a zip of highlighted copies.

    The base64-encoded PDFs are read from the ``files`` table, written to
    ``temp/``, diffed word by word, and the differing words are highlighted
    in copies saved under ``output/``. Both copies are zipped into
    ``output/output.zip``, which is sent as an attachment.

    Any failure (non-numeric or unknown *uuid*, unreadable PDF, ...) redirects
    to the index view with ``message='error in comparison'``.
    """
    import os

    input_file1 = 'temp/temp_orig.pdf'
    input_file2 = 'temp/temp_ref.pdf'
    output_file1 = 'output/Original_file.pdf'
    output_file2 = 'output/Reference_file.pdf'
    archive_path = 'output/output.zip'

    try:
        uuid = int(uuid)
        sqliteConnection = sqlite3.connect('baaangt.db')
        try:
            cursor = sqliteConnection.cursor()
            # Placeholder binding instead of string formatting.
            cursor.execute(
                "Select original_pdf,reference_pdf from files where uuid = ?",
                (uuid,),
            )
            row = cursor.fetchone()
        finally:
            sqliteConnection.close()
        if row is None:
            raise LookupError('no stored files for UUID {}'.format(uuid))

        os.makedirs('temp', exist_ok=True)
        os.makedirs('output', exist_ok=True)
        for path, encoded in ((input_file1, row[0]), (input_file2, row[1])):
            with open(path, 'wb') as f:
                f.write(base64.b64decode(encoded))

        print('Comparing files ', input_file1, ' and ', input_file2, '.....')
        fullText1, pages1 = _read_pdf_text(input_file1)
        fullText2, pages2 = _read_pdf_text(input_file2)

        one_text, two_text = _changed_words(fullText1, fullText2)
        # Two passes, the second with the roles swapped, drop text that merely
        # moved rather than changed.
        one_text, two_text = _blank_shared_runs(one_text, two_text)
        two_text, one_text = _blank_shared_runs(two_text, one_text)

        print('Generating', output_file1, '.....')
        _highlight_words(input_file1, output_file1, one_text.split(), pages1)
        print('Generating', output_file2, '.....')
        _highlight_words(input_file2, output_file2, two_text.split(), pages2)

        with ZipFile(archive_path, 'w') as zipObj:
            zipObj.write(output_file1)
            zipObj.write(output_file2)
        print('Finish')
    except Exception:
        print('error in comparison')
        return redirect(url_for('.index', message='error in comparison'))

    # Send the archive that was just written (resolved against the working
    # directory) instead of a hard-coded absolute path on one machine.
    return send_file(os.path.abspath(archive_path), as_attachment=True)
|
|
|
+
|
|
|
# Start Flask's built-in server only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    app.run()
|