Adeel 4 years ago
parent
commit
7ae0f51d83
7 changed files with 439 additions and 0 deletions
  1. 288 0
      app.py
  2. BIN
      output/Original_file.pdf
  3. BIN
      output/Reference_file.pdf
  4. BIN
      output/output.zip
  5. BIN
      temp/temp_orig.pdf
  6. BIN
      temp/temp_ref.pdf
  7. 151 0
      templates/index.html

+ 288 - 0
app.py

@@ -0,0 +1,288 @@
+
+import base64
+import difflib
+import os
+import re
+import sqlite3
+from contextlib import closing
+from difflib import SequenceMatcher
+from zipfile import ZipFile
+
+import fitz
+import PyPDF2
+from flask import Flask,request,jsonify,render_template,send_file,Response,redirect,url_for
+from flask_bootstrap import Bootstrap
+
+
+ALLOWED_EXTENSIONS = {'pdf'}
+app = Flask(__name__)
+Bootstrap(app)
+app.secret_key = '12345'
+app.run
+def allowed_file(filename):
+    return '.' in filename and \
+           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+@app.route('/uploadajax', methods=['GET', 'POST'])
+def upload_file():
+    sqliteConnection = sqlite3.connect('baaangt.db')
+    cursor = sqliteConnection.cursor()
+    print("Connected to SQLite")
+    cursor.execute('''CREATE TABLE IF NOT EXISTS files (
+    UUID integer PRIMARY KEY,
+    original_pdf_name NOT NULL, 
+    original_pdf text NOT NULL,
+    reference_pdf_name NOT NULL, 
+    reference_pdf text NOT NULL 
+);''')
+    query = """ INSERT INTO files
+                                  (UUID,original_pdf_name, original_pdf,reference_pdf_name,reference_pdf) VALUES (?, ?, ?,?,?)"""
+    if request.method == 'POST':
+        if request.files:
+            file_orig = request.files['original']
+            file_ref = request.files['reference']
+            uuid = request.form['UUID']
+            files_json = [{'file_orig':file_orig.filename,'file_ref':file_ref.filename,'uuid':uuid}]
+            print(files_json)
+            print(allowed_file(file_orig.filename))
+            try:
+                if allowed_file(file_orig.filename) and allowed_file(file_ref.filename) and uuid:
+                    blob_orig = base64.b64encode(file_orig.read())
+                    blob_ref = base64.b64encode(file_ref.read())
+                    cursor.execute('select uuid from files where uuid = {}'.format(int(uuid)))
+                    data = [i for i in cursor.fetchall()]
+                    if len(data)!= 0:
+                        return Response("UUID Already Exists", status=400, mimetype='application/json')
+                    data_tuple = (uuid,file_orig.filename,blob_orig,file_ref.filename,blob_ref)
+                    cursor.execute(query,data_tuple)
+                    sqliteConnection.commit()
+                    cursor.close()
+                    return jsonify(files_json)
+                else:
+                    return Response("All fields must be selected", status=400, mimetype='application/json')
+            except Exception as e:
+                return Response("Error in uploading", status=400, mimetype='application/json')
+        else:
+            uuid_value = int(request.json['uuid'])
+            print(type(uuid_value))
+            cursor.execute('Delete from files where UUID = {}'.format(uuid_value))
+            sqliteConnection.commit()
+            print(request.json['uuid'])
+    else:
+        try:
+            cursor.execute('Select UUID, original_pdf_name,reference_pdf_name from files')
+            db_data = [i for i in cursor.fetchall()]
+            cursor.close()
+            # print(db_data)
+            return jsonify(db_data)
+        except:
+            return jsonify('')
+    return render_template('index.html')
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+@app.route('/comparison/<uuid>',methods=['GET'])
+def comparison_(uuid):
+    try:
+    # uuid = int(request.json['uuid'])
+        uuid = int(uuid)
+        sqliteConnection = sqlite3.connect('baaangt.db')
+        cursor = sqliteConnection.cursor()
+        cursor.execute("Select UUID, original_pdf,reference_pdf from files where uuid = {}".format(uuid))
+        blob = cursor.fetchone()
+        blob_orig = base64.b64decode(blob[1])
+        blob_ref = base64.b64decode(blob[2])
+        with open('temp/temp_orig.pdf', 'wb') as f:
+            f.write(blob_orig)
+        with open('temp/temp_ref.pdf', 'wb') as f:
+            f.write(blob_ref)
+
+        input_file1 = 'temp/temp_orig.pdf'
+        input_file2 = 'temp/temp_ref.pdf'
+
+        output_file1 = 'output/Original_file.pdf'
+        output_file2 = 'output/Reference_file.pdf'
+        print('Comparing files ', input_file1, ' and ', input_file2, '.....')
+
+        fullText1 = ""
+
+        pdfFileObj = open(input_file1, 'rb')
+
+        #The pdfReader variable is a readable object that will be parsed
+        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
+
+        #discerning the number of pages will allow us to parse through all #the pages
+        num_pages = pdfReader.numPages
+        count = 0
+        text = ""
+        pages1 = []
+
+        #The while loop will read each page
+        while count < num_pages:
+            pageObj = pdfReader.getPage(count)
+            count +=1
+            temp = pageObj.extractText()
+            text += temp
+            pages1.append(temp)
+
+        fullText1 = text
+        fullText1 = fullText1.replace('\n', ' ')
+        fullText1 = fullText1.replace(' \n', ' ')
+        fullText1 = re.sub(' +', ' ', fullText1)
+
+        while True:
+            try:
+                inz = fullText1.index('Seite')
+                temp = ' '.join(fullText1[inz:].split()[:4])
+                fullText1 = fullText1.replace(temp, '')
+            except:
+                break
+
+        fullText2 = ""
+
+        pdfFileObj = open(input_file2, 'rb')
+
+        #The pdfReader variable is a readable object that will be parsed
+        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
+
+        #discerning the number of pages will allow us to parse through all #the pages
+        num_pages = pdfReader.numPages
+        count = 0
+        text = ""
+        pages2 = []
+
+        #The while loop will read each page
+        while count < num_pages:
+            pageObj = pdfReader.getPage(count)
+            count +=1
+            temp = pageObj.extractText()
+            text += temp
+            pages2.append(temp)
+
+
+        fullText2 = text
+        fullText2 = fullText2.replace('\n', ' ')
+        fullText2 = fullText2.replace(' \n', ' ')
+        fullText2 = re.sub(' +', ' ', fullText2)
+
+        while True:
+            try:
+                inz = fullText2.index('Seite')
+                temp = ' '.join(fullText2[inz:].split()[:4])
+                fullText2 = fullText2.replace(temp, '')
+            except:
+                break
+
+        str1 = fullText1
+        str2 = fullText2
+
+        delta = difflib.Differ().compare(str1.split(), str2.split())
+        # difflist = []
+        one = []
+        two = []
+
+
+        for line in delta:
+            if line[0] == '?':
+                continue
+            elif line[0] == ' ':
+                continue
+            else:
+                if line[0] == '-':
+                    one.append(line[2:])
+                elif line[0] == '+':
+                    two.append(line[2:])
+
+                # difflist.append(line)
+
+
+        # mix = [l[:] for l in '\n'.join(difflist).splitlines() if l]
+        one = [l[:] for l in '\n'.join(one).splitlines() if l]
+        two = [l[:] for l in '\n'.join(two).splitlines() if l]
+
+        one_text = ' '.join(one)
+        two_text = ' '.join(two)
+
+        one_final = one_text
+        two_final = two_text
+        matches = SequenceMatcher(None, one_text, two_text).get_matching_blocks()
+        for match in matches:
+            sen = one_text[match.a:match.a + match.size]
+            if len(sen) > 6:
+                # print(sen)
+                one_final = one_final.replace(sen, ' ', 1)
+                two_final = two_final.replace(sen, ' ', 1)
+
+        one_text = one_final
+        two_text = two_final
+
+        matches = SequenceMatcher(None, two_text, one_text).get_matching_blocks()
+        for match in matches:
+            sen = two_text[match.a:match.a + match.size]
+            if len(sen) > 6:
+                # print(sen)
+                one_final = one_final.replace(sen, ' ', 1)
+                two_final = two_final.replace(sen, ' ', 1)
+
+        print('Generating', output_file1, '.....')
+        one_list = one_final.split()
+
+        doc1 = fitz.open(input_file1)
+        page_no = 0
+        for word in one_list:
+            for i in range(page_no, len(pages1)):
+                if word in pages1[i]:
+                    page = doc1[i]
+                    text_instances = page.searchFor(word)
+                    for inst in text_instances:
+                        highlight = page.addHighlightAnnot(inst)
+                        break
+                    break
+                page_no += 1
+
+        if one_list[0].isdigit():
+            word = one_list[0]
+            for i in range(len(pages1)):
+                page = doc1[i]
+                text_instances = page.searchFor(word)
+                for inst in text_instances:
+                    highlight = page.addHighlightAnnot(inst)
+                    break
+
+        doc1.save(output_file1, garbage=4, deflate=True, clean=True)
+
+        print('Generating', output_file2, '.....')
+        two_list = two_final.split()
+
+        # for i, page in enumerate(pages1):
+        doc2 = fitz.open(input_file2)
+        page_no = 0
+        for word in two_list:
+            for i in range(page_no, len(pages2)):
+                if word in pages2[i]:
+                    page = doc2[i]
+                    text_instances = page.searchFor(word)
+                    for inst in text_instances:
+                        highlight = page.addHighlightAnnot(inst)
+                        break
+                    break
+                page_no += 1
+        if two_list[0].isdigit():
+            word = two_list[0]
+            for i in range(len(pages2)):
+                page = doc2[i]
+                text_instances = page.searchFor(word)
+                for inst in text_instances:
+                    highlight = page.addHighlightAnnot(inst)
+                    break
+        doc2.save(output_file2, garbage=4, deflate=True, clean=True)
+        zipObj = ZipFile('output/output.zip', 'w')
+        zipObj.write(output_file1)
+        zipObj.write(output_file2)
+        zipObj.close()
+        print('Finish')
+    except:
+        print('error in comparison')
+        return redirect(url_for('.index',message = 'error in comparison'))
+    return send_file('C:\\Users\\Siraj\\PycharmProjects\\baangt\\output\\output.zip',as_attachment=True)
+
+# Start the Flask application only when this file is executed as a script.
+if __name__ == "__main__":
+    app.run()

BIN
output/Original_file.pdf


BIN
output/Reference_file.pdf


BIN
output/output.zip


BIN
temp/temp_orig.pdf


BIN
temp/temp_ref.pdf


+ 151 - 0
templates/index.html

@@ -0,0 +1,151 @@
+{##}
+{# This simple template derives from ``base.html``. See ``base.html`` for
+{#   more information about template inheritance. #}
+{#{%- extends "bootstrap/base.html" %}#}
+{##}
+{# Loads some of the macros included with Flask-Bootstrap. We are using the
+{#   utils module here to automatically render Flask's flashed messages in a#}
+{#   bootstrap friendly manner #}
+{% import "bootstrap/utils.html" as utils %}
+
+
+
+{% block content %}
+<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
+    <link href="https://cdn.datatables.net/1.10.20/css/dataTables.bootstrap4.min.css" rel="stylesheet">
+     <script src="//cdnjs.cloudflare.com/ajax/libs/jquery/3.2.1/jquery.min.js"></script>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.16.0/umd/popper.min.js"></script>
+    <script src="https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js"></script>
+    <script src="https://cdn.datatables.net/1.10.20/js/dataTables.bootstrap4.min.js"></script>
+
+
+  <div class="container">
+    <div class="row">
+      <div class="col-md-12">
+        <div id="msg" class="alert alert-primary" role="alert"></div>
+      </div>
+    </div>
+    <div class="jumbotron">
+      <h1>Upload new File</h1>
+    <form id="upload-file" method="post" enctype="multipart/form-data" onsubmit="return false">
+        <div class="form-group">
+  <label for="uuid">UUID:</label>
+  <input type="text" class="form-control" name="UUID" id="uuid">
+</div>
+      <div class="custom-file">
+    <input type="file" class="custom-file-input" name="original" id="customFile1">
+    <label class="custom-file-label" id="custom-file-label1" for="customFile1">Choose Original file</label>
+  </div>
+       <div class="custom-file">
+    <input type="file" class="custom-file-input" name="reference" id="customFile2">
+    <label class="custom-file-label" id="custom-file-label2" for="customFile2">Choose Reference file</label>
+  </div>
+        <hr>
+        <br>
+
+      <button id="sub" class="btn btn-primary"  >Upload</button>
+    </form>
+    </div>
+  <table id="example" class="table table-striped table-bordered" style="width:100%">
+        <thead>
+            <tr>
+                <th>UUID</th>
+                <th>Original File</th>
+                <th>Reference File</th>
+                <th>Delete/Compare</th>
+            </tr>
+        </thead>
+      <tbody>
+
+
+      </tbody>
+  </table>
+   </div>
+
+    <script>
+
+        {% if  message %}
+             // tojson renders the value as a quoted, escaped JavaScript string literal
+             alert({{ message|tojson }});
+        {% endif %}
+
+// Add the following code if you want the name of the file appear on select
+$("#customFile1").on("change", function() {
+  var fileName = $(this).val().split("\\").pop();
+  $(this).siblings("#custom-file-label1").addClass("selected").html(fileName);
+});
+$("#customFile2").on("change", function() {
+  var fileName = $(this).val().split("\\").pop();
+  $(this).siblings("#custom-file-label2").addClass("selected").html(fileName);
+});
+
+var data_tables = $('#example').DataTable();
+
+$('#example').on('click', 'a.Delete', function (e) {
+        e.preventDefault();
+        var uuid = data_tables.row( $(this).parents('tr')).data()[0];
+        var json_text = {'uuid':uuid}
+         $.ajax({
+        url: '/uploadajax',
+        type:'POST',
+        data:JSON.stringify(json_text),
+        contentType: 'application/json;charset=UTF-8',
+        success: function() {
+              console.log('deleted')
+        }
+    });
+        data_tables
+            .row( $(this).parents('tr') )
+            .remove()
+            .draw();
+    } );
+
+
+
+$(document).ready(function() {
+    $('#msg').hide()
+    $.ajax({
+        url: '/uploadajax',
+        type:'GET',
+        success: function(data) {
+              $.each(data, function (key, item) {
+                          var buton = '<a href = "" class="Delete">Delete</a>/<a href = "/comparison/'+ item[0]+'" class="Compare">Compare</a>';
+                          data_tables.row.add([item[0],item[1],item[2],buton]).draw(true)
+                        });
+
+        }
+    });
+} );
+
+
+$(function() {
+    $('#sub').click(function() {
+        var form_data = new FormData($('#upload-file')[0]);
+        $.ajax({
+            type: 'POST',
+            url: '/uploadajax',
+            data: form_data,
+            contentType: false,
+            cache: false,
+            processData: false,
+            success: function(data) {
+                $('#msg').show();
+                $('#msg').text('Success');
+
+              $.each(data, function (key, item) {
+                  var buton = '<a href = "" class="Delete">Delete</a>/<a href = "/comparison/'+ item["uuid"]+'" class="Compare">Compare</a>';
+                  data_tables.row.add([item['uuid'],item['file_orig'],item['file_ref'],buton]).draw(true)
+              });
+            },
+            statusCode: {
+        400: function(data) {
+                $('#msg').html(data.responseText);
+                $('#msg').show();
+    }}
+
+        });
+    });
+});
+
+
+</script>
+{%- endblock %}