4 years ago · 7ae0f51d83
--- a/app.py
+++ b/app.py
@@ -0,0 +1,288 @@
 
				+
			
 
				+from flask import Flask,request,jsonify,render_template,send_file,Response,redirect,url_for
			
 
				+import base64
			
 
				+from flask_bootstrap import Bootstrap
			
 
				+import sqlite3
			
 
				+import re
			
 
				+from difflib import SequenceMatcher
			
 
				+import PyPDF2
			
 
				+import difflib
			
 
				+import fitz
			
 
				+from zipfile import ZipFile
			
 
				+
			
 
				+
			
 
				+ALLOWED_EXTENSIONS = {'pdf'}
			
 
				+app = Flask(__name__)
			
 
				+Bootstrap(app)
			
 
				+app.secret_key = '12345'
			
 
				+app.run
			
 
				+def allowed_file(filename):
			
 
				+    return '.' in filename and \
			
 
				+           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
			
 
				+
			
 
				+@app.route('/uploadajax', methods=['GET', 'POST'])
			
 
				+def upload_file():
			
 
				+    sqliteConnection = sqlite3.connect('baaangt.db')
			
 
				+    cursor = sqliteConnection.cursor()
			
 
				+    print("Connected to SQLite")
			
 
				+    cursor.execute('''CREATE TABLE IF NOT EXISTS files (
			
 
				+    UUID integer PRIMARY KEY,
			
 
				+    original_pdf_name NOT NULL, 
			
 
				+    original_pdf text NOT NULL,
			
 
				+    reference_pdf_name NOT NULL, 
			
 
				+    reference_pdf text NOT NULL 
			
 
				+);''')
			
 
				+    query = """ INSERT INTO files
			
 
				+                                  (UUID,original_pdf_name, original_pdf,reference_pdf_name,reference_pdf) VALUES (?, ?, ?,?,?)"""
			
 
				+    if request.method == 'POST':
			
 
				+        if request.files:
			
 
				+            file_orig = request.files['original']
			
 
				+            file_ref = request.files['reference']
			
 
				+            uuid = request.form['UUID']
			
 
				+            files_json = [{'file_orig':file_orig.filename,'file_ref':file_ref.filename,'uuid':uuid}]
			
 
				+            print(files_json)
			
 
				+            print(allowed_file(file_orig.filename))
			
 
				+            try:
			
 
				+                if allowed_file(file_orig.filename) and allowed_file(file_ref.filename) and uuid:
			
 
				+                    blob_orig = base64.b64encode(file_orig.read())
			
 
				+                    blob_ref = base64.b64encode(file_ref.read())
			
 
				+                    cursor.execute('select uuid from files where uuid = {}'.format(int(uuid)))
			
 
				+                    data = [i for i in cursor.fetchall()]
			
 
				+                    if len(data)!= 0:
			
 
				+                        return Response("UUID Already Exists", status=400, mimetype='application/json')
			
 
				+                    data_tuple = (uuid,file_orig.filename,blob_orig,file_ref.filename,blob_ref)
			
 
				+                    cursor.execute(query,data_tuple)
			
 
				+                    sqliteConnection.commit()
			
 
				+                    cursor.close()
			
 
				+                    return jsonify(files_json)
			
 
				+                else:
			
 
				+                    return Response("All fields must be selected", status=400, mimetype='application/json')
			
 
				+            except Exception as e:
			
 
				+                return Response("Error in uploading", status=400, mimetype='application/json')
			
 
				+        else:
			
 
				+            uuid_value = int(request.json['uuid'])
			
 
				+            print(type(uuid_value))
			
 
				+            cursor.execute('Delete from files where UUID = {}'.format(uuid_value))
			
 
				+            sqliteConnection.commit()
			
 
				+            print(request.json['uuid'])
			
 
				+    else:
			
 
				+        try:
			
 
				+            cursor.execute('Select UUID, original_pdf_name,reference_pdf_name from files')
			
 
				+            db_data = [i for i in cursor.fetchall()]
			
 
				+            cursor.close()
			
 
				+            # print(db_data)
			
 
				+            return jsonify(db_data)
			
 
				+        except:
			
 
				+            return jsonify('')
			
 
				+    return render_template('index.html')
			
 
				+@app.route('/')
			
 
				+def index():
			
 
				+    return render_template('index.html')
			
 
				+
			
 
				+@app.route('/comparison/<uuid>',methods=['GET'])
			
 
				+def comparison_(uuid):
			
 
				+    try:
			
 
				+    # uuid = int(request.json['uuid'])
			
 
				+        uuid = int(uuid)
			
 
				+        sqliteConnection = sqlite3.connect('baaangt.db')
			
 
				+        cursor = sqliteConnection.cursor()
			
 
				+        cursor.execute("Select UUID, original_pdf,reference_pdf from files where uuid = {}".format(uuid))
			
 
				+        blob = cursor.fetchone()
			
 
				+        blob_orig = base64.b64decode(blob[1])
			
 
				+        blob_ref = base64.b64decode(blob[2])
			
 
				+        with open('temp/temp_orig.pdf', 'wb') as f:
			
 
				+            f.write(blob_orig)
			
 
				+        with open('temp/temp_ref.pdf', 'wb') as f:
			
 
				+            f.write(blob_ref)
			
 
				+
			
 
				+        input_file1 = 'temp/temp_orig.pdf'
			
 
				+        input_file2 = 'temp/temp_ref.pdf'
			
 
				+
			
 
				+        output_file1 = 'output/Original_file.pdf'
			
 
				+        output_file2 = 'output/Reference_file.pdf'
			
 
				+        print('Comparing files ', input_file1, ' and ', input_file2, '.....')
			
 
				+
			
 
				+        fullText1 = ""
			
 
				+
			
 
				+        pdfFileObj = open(input_file1, 'rb')
			
 
				+
			
 
				+        #The pdfReader variable is a readable object that will be parsed
			
 
				+        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
			
 
				+
			
 
				+        #discerning the number of pages will allow us to parse through all #the pages
			
 
				+        num_pages = pdfReader.numPages
			
 
				+        count = 0
			
 
				+        text = ""
			
 
				+        pages1 = []
			
 
				+
			
 
				+        #The while loop will read each page
			
 
				+        while count < num_pages:
			
 
				+            pageObj = pdfReader.getPage(count)
			
 
				+            count +=1
			
 
				+            temp = pageObj.extractText()
			
 
				+            text += temp
			
 
				+            pages1.append(temp)
			
 
				+
			
 
				+        fullText1 = text
			
 
				+        fullText1 = fullText1.replace('\n', ' ')
			
 
				+        fullText1 = fullText1.replace(' \n', ' ')
			
 
				+        fullText1 = re.sub(' +', ' ', fullText1)
			
 
				+
			
 
				+        while True:
			
 
				+            try:
			
 
				+                inz = fullText1.index('Seite')
			
 
				+                temp = ' '.join(fullText1[inz:].split()[:4])
			
 
				+                fullText1 = fullText1.replace(temp, '')
			
 
				+            except:
			
 
				+                break
			
 
				+
			
 
				+        fullText2 = ""
			
 
				+
			
 
				+        pdfFileObj = open(input_file2, 'rb')
			
 
				+
			
 
				+        #The pdfReader variable is a readable object that will be parsed
			
 
				+        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
			
 
				+
			
 
				+        #discerning the number of pages will allow us to parse through all #the pages
			
 
				+        num_pages = pdfReader.numPages
			
 
				+        count = 0
			
 
				+        text = ""
			
 
				+        pages2 = []
			
 
				+
			
 
				+        #The while loop will read each page
			
 
				+        while count < num_pages:
			
 
				+            pageObj = pdfReader.getPage(count)
			
 
				+            count +=1
			
 
				+            temp = pageObj.extractText()
			
 
				+            text += temp
			
 
				+            pages2.append(temp)
			
 
				+
			
 
				+
			
 
				+        fullText2 = text
			
 
				+        fullText2 = fullText2.replace('\n', ' ')
			
 
				+        fullText2 = fullText2.replace(' \n', ' ')
			
 
				+        fullText2 = re.sub(' +', ' ', fullText2)
			
 
				+
			
 
				+        while True:
			
 
				+            try:
			
 
				+                inz = fullText2.index('Seite')
			
 
				+                temp = ' '.join(fullText2[inz:].split()[:4])
			
 
				+                fullText2 = fullText2.replace(temp, '')
			
 
				+            except:
			
 
				+                break
			
 
				+
			
 
				+        str1 = fullText1
			
 
				+        str2 = fullText2
			
 
				+
			
 
				+        delta = difflib.Differ().compare(str1.split(), str2.split())
			
 
				+        # difflist = []
			
 
				+        one = []
			
 
				+        two = []
			
 
				+
			
 
				+
			
 
				+        for line in delta:
			
 
				+            if line[0] == '?':
			
 
				+                continue
			
 
				+            elif line[0] == ' ':
			
 
				+                continue
			
 
				+            else:
			
 
				+                if line[0] == '-':
			
 
				+                    one.append(line[2:])
			
 
				+                elif line[0] == '+':
			
 
				+                    two.append(line[2:])
			
 
				+
			
 
				+                # difflist.append(line)
			
 
				+
			
 
				+
			
 
				+        # mix = [l[:] for l in '\n'.join(difflist).splitlines() if l]
			
 
				+        one = [l[:] for l in '\n'.join(one).splitlines() if l]
			
 
				+        two = [l[:] for l in '\n'.join(two).splitlines() if l]
			
 
				+
			
 
				+        one_text = ' '.join(one)
			
 
				+        two_text = ' '.join(two)
			
 
				+
			
 
				+        one_final = one_text
			
 
				+        two_final = two_text
			
 
				+        matches = SequenceMatcher(None, one_text, two_text).get_matching_blocks()
			
 
				+        for match in matches:
			
 
				+            sen = one_text[match.a:match.a + match.size]
			
 
				+            if len(sen) > 6:
			
 
				+                # print(sen)
			
 
				+                one_final = one_final.replace(sen, ' ', 1)
			
 
				+                two_final = two_final.replace(sen, ' ', 1)
			
 
				+
			
 
				+        one_text = one_final
			
 
				+        two_text = two_final
			
 
				+
			
 
				+        matches = SequenceMatcher(None, two_text, one_text).get_matching_blocks()
			
 
				+        for match in matches:
			
 
				+            sen = two_text[match.a:match.a + match.size]
			
 
				+            if len(sen) > 6:
			
 
				+                # print(sen)
			
 
				+                one_final = one_final.replace(sen, ' ', 1)
			
 
				+                two_final = two_final.replace(sen, ' ', 1)
			
 
				+
			
 
				+        print('Generating', output_file1, '.....')
			
 
				+        one_list = one_final.split()
			
 
				+
			
 
				+        doc1 = fitz.open(input_file1)
			
 
				+        page_no = 0
			
 
				+        for word in one_list:
			
 
				+            for i in range(page_no, len(pages1)):
			
 
				+                if word in pages1[i]:
			
 
				+                    page = doc1[i]
			
 
				+                    text_instances = page.searchFor(word)
			
 
				+                    for inst in text_instances:
			
 
				+                        highlight = page.addHighlightAnnot(inst)
			
 
				+                        break
			
 
				+                    break
			
 
				+                page_no += 1
			
 
				+
			
 
				+        if one_list[0].isdigit():
			
 
				+            word = one_list[0]
			
 
				+            for i in range(len(pages1)):
			
 
				+                page = doc1[i]
			
 
				+                text_instances = page.searchFor(word)
			
 
				+                for inst in text_instances:
			
 
				+                    highlight = page.addHighlightAnnot(inst)
			
 
				+                    break
			
 
				+
			
 
				+        doc1.save(output_file1, garbage=4, deflate=True, clean=True)
			
 
				+
			
 
				+        print('Generating', output_file2, '.....')
			
 
				+        two_list = two_final.split()
			
 
				+
			
 
				+        # for i, page in enumerate(pages1):
			
 
				+        doc2 = fitz.open(input_file2)
			
 
				+        page_no = 0
			
 
				+        for word in two_list:
			
 
				+            for i in range(page_no, len(pages2)):
			
 
				+                if word in pages2[i]:
			
 
				+                    page = doc2[i]
			
 
				+                    text_instances = page.searchFor(word)
			
 
				+                    for inst in text_instances:
			
 
				+                        highlight = page.addHighlightAnnot(inst)
			
 
				+                        break
			
 
				+                    break
			
 
				+                page_no += 1
			
 
				+        if two_list[0].isdigit():
			
 
				+            word = two_list[0]
			
 
				+            for i in range(len(pages2)):
			
 
				+                page = doc2[i]
			
 
				+                text_instances = page.searchFor(word)
			
 
				+                for inst in text_instances:
			
 
				+                    highlight = page.addHighlightAnnot(inst)
			
 
				+                    break
			
 
				+        doc2.save(output_file2, garbage=4, deflate=True, clean=True)
			
 
				+        zipObj = ZipFile('output/output.zip', 'w')
			
 
				+        zipObj.write(output_file1)
			
 
				+        zipObj.write(output_file2)
			
 
				+        zipObj.close()
			
 
				+        print('Finish')
			
 
				+    except:
			
 
				+        print('error in comparison')
			
 
				+        return redirect(url_for('.index',message = 'error in comparison'))
			
 
				+    return send_file('C:\\Users\\Siraj\\PycharmProjects\\baangt\\output\\output.zip',as_attachment=True)
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    app.run()
			
--- a/output/Original_file.pdf
+++ b/output/Original_file.pdf
--- a/output/Reference_file.pdf
+++ b/output/Reference_file.pdf
--- a/output/output.zip
+++ b/output/output.zip
--- a/temp/temp_orig.pdf
+++ b/temp/temp_orig.pdf
--- a/temp/temp_ref.pdf
+++ b/temp/temp_ref.pdf
--- a/templates/index.html
+++ b/templates/index.html
@@ -0,0 +1,151 @@
 
				+{##}
			
 
				+{# This simple template derives from ``base.html``. See ``base.html`` for
			
 
				+{#   more information about template inheritance. #}
			
 
				+{#{%- extends "bootstrap/base.html" %}#}
			
 
				+{##}
			
 
				+{# Loads some of the macros included with Flask-Bootstrap. We are using the
			
 
				+{#   utils module here to automatically render Flask's flashed messages in a#}
			
 
				+{#   bootstrap friendly manner #}
			
 
				+{% import "bootstrap/utils.html" as utils %}
			
 
				+
			
 
				+
			
 
				+
			
 
				+{% block content %}
			
 
				+<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
			
 
				+    <link href="https://cdn.datatables.net/1.10.20/css/dataTables.bootstrap4.min.css" rel="stylesheet">
			
 
				+     <script src="//cdnjs.cloudflare.com/ajax/libs/jquery/3.2.1/jquery.min.js"></script>
			
 
				+  <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.16.0/umd/popper.min.js"></script>
			
 
				+    <script src="https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js"></script>
			
 
				+    <script src="https://cdn.datatables.net/1.10.20/js/dataTables.bootstrap4.min.js"></script>
			
 
				+
			
 
				+
			
 
				+  <div class="container">
			
 
				+    <div class="row">
			
 
				+      <div class="col-md-12">
			
 
				+        <div id="msg" class="alert alert-primary" role="alert"></div>
			
 
				+      </div>
			
 
				+    </div>
			
 
				+    <div class="jumbotron">
			
 
				+      <h1>Upload new File</h1>
			
 
				+    <form id="upload-file" method="post" enctype="multipart/form-data" onsubmit="return false">
			
 
				+        <div class="form-group">
			
 
				+  <label for="uuid">UUID:</label>
			
 
				+  <input type="text" class="form-control" name="UUID" id="uuid">
			
 
				+</div>
			
 
				+      <div class="custom-file">
			
 
				+    <input type="file" class="custom-file-input" name="original" id="customFile1">
			
 
    <!-- "for" must name the input's id (customFile1); "customFile" matched no element. -->
    <label class="custom-file-label" id="custom-file-label1" for="customFile1">Choose Original file</label>
			
 
				+  </div>
			
 
				+       <div class="custom-file">
			
 
				+    <input type="file" class="custom-file-input" name="reference" id="customFile2">
			
 
    <!-- "for" must name the input's id (customFile2); "customFile" matched no element. -->
    <label class="custom-file-label" id="custom-file-label2" for="customFile2">Choose Reference file</label>
			
 
				+  </div>
			
 
				+        <hr>
			
 
				+        <br>
			
 
				+
			
 
				+      <button id="sub" class="btn btn-primary"  >Upload</button>
			
 
				+    </form>
			
 
				+    </div>
			
 
				+  <table id="example" class="table table-striped table-bordered" style="width:100%">
			
 
				+        <thead>
			
 
				+            <tr>
			
 
				+                <th>UUID</th>
			
 
				+                <th>Original File</th>
			
 
				+                <th>Reference File</th>
			
 
				+                <th>Delete/Compare</th>
			
 
				+            </tr>
			
 
				+        </thead>
			
 
				+      <tbody>
			
 
				+
			
 
				+
			
 
				+      </tbody>
			
 
				+  </table>
			
 
				+   </div>
			
 
				+
			
 
				+    <script>
			
 
				+
			
 
				+        {% if  message %}
			
 
				+             alert({{ message }})
			
 
				+        {% endif %}
			
 
				+
			
 
				+// Add the following code if you want the name of the file appear on select
			
 
				+$("#customFile1").on("change", function() {
			
 
				+  var fileName = $(this).val().split("\\").pop();
			
 
				+  $(this).siblings("#custom-file-label1").addClass("selected").html(fileName);
			
 
				+});
			
 
				+$("#customFile2").on("change", function() {
			
 
				+  var fileName = $(this).val().split("\\").pop();
			
 
				+  $(this).siblings("#custom-file-label2").addClass("selected").html(fileName);
			
 
				+});
			
 
				+
			
 
				+var data_tables = $('#example').DataTable();
			
 
				+
			
 
				+$('#example').on('click', 'a.Delete', function (e) {
			
 
				+        e.preventDefault();
			
 
				+        var uuid = data_tables.row( $(this).parents('tr')).data()[0];
			
 
				+        var json_text = {'uuid':uuid}
			
 
				+         $.ajax({
			
 
				+        url: '/uploadajax',
			
 
				+        type:'POST',
			
 
				+        data:JSON.stringify(json_text),
			
 
				+        contentType: 'application/json;charset=UTF-8',
			
 
				+        success: function() {
			
 
				+              console.log('deleted')
			
 
				+        }
			
 
				+    });
			
 
				+        data_tables
			
 
				+            .row( $(this).parents('tr') )
			
 
				+            .remove()
			
 
				+            .draw();
			
 
				+    } );
			
 
				+
			
 
				+
			
 
				+
			
 
				+$(document).ready(function() {
			
 
				+    $('#msg').hide()
			
 
				+    $.ajax({
			
 
				+        url: '/uploadajax',
			
 
				+        type:'GET',
			
 
				+        success: function(data) {
			
 
				+              $.each(data, function (key, item) {
			
 
				+                          var buton = '<a href = "" class="Delete">Delete</a>/<a href = "/comparison/'+ item[0]+'" class="Compare">Compare</a>';
			
 
				+                          data_tables.row.add([item[0],item[1],item[2],buton]).draw(true)
			
 
				+                        });
			
 
				+
			
 
				+        }
			
 
				+    });
			
 
				+} );
			
 
				+
			
 
				+
			
 
				+$(function() {
			
 
				+    $('#sub').click(function() {
			
 
				+        var form_data = new FormData($('#upload-file')[0]);
			
 
				+        $.ajax({
			
 
				+            type: 'POST',
			
 
				+            url: '/uploadajax',
			
 
				+            data: form_data,
			
 
				+            contentType: false,
			
 
				+            cache: false,
			
 
				+            processData: false,
			
 
				+            success: function(data) {
			
 
				+                $('#msg').show();
			
 
				+                $('#msg').text('Success');
			
 
				+
			
 
				+              $.each(data, function (key, item) {
			
 
				+                  var buton = '<a href = "" class="Delete">Delete</a>/<a href = "/comparison/'+ item["uuid"]+'" class="Compare">Compare</a>';
			
 
				+                  data_tables.row.add([item['uuid'],item['file_orig'],item['file_ref'],buton]).draw(true)
			
 
				+              });
			
 
				+            },
			
 
				+            statusCode: {
			
 
				+        400: function(data) {
			
 
				+                $('#msg').html(data.responseText);
			
 
				+                $('#msg').show();
			
 
				+    }}
			
 
				+
			
 
				+        });
			
 
				+    });
			
 
				+});
			
 
				+
			
 
				+
			
 
				+</script>
			
 
				+{%- endblock %}