Spaces:
Sleeping
Sleeping
Commit
·
e4a65f3
1
Parent(s):
db9597b
adding QA
Browse files
app.py
CHANGED
@@ -1,18 +1,111 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
-
|
4 |
-
|
5 |
-
question = "How many uncovered bridges are there?"
|
6 |
|
7 |
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
#result = question_answerer(question = question, context=context)
|
10 |
#return result['answer']
|
11 |
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
theme = "grass",
|
18 |
-
examples = [[context, question]]).launch()
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
+
from transformers import MarkupLMTokenizer, MarkupLMModel, MarkupLMForQuestionAnswering
|
4 |
+
import torch
|
|
|
5 |
|
6 |
|
7 |
+
title = 'MarkupLM QA App'
|
8 |
+
#context = "There are three bridges in the county. Two of them are covered."
|
9 |
+
#question = "How many uncovered bridges are there?"
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
|
23 |
+
# Adapted from NielsRogge's MarkupLM code samples and the Hugging Face question-answering tutorial from Cloudera Fast Forward Labs.
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
|
29 |
+
model = MarkupLMForQuestionAnswering.from_pretrained("FuriouslyAsleep/markuplm-large-finetuned-qa")
|
30 |
+
|
31 |
+
tokenizer = MarkupLMTokenizer(
|
32 |
+
vocab_file="vocab.json",
|
33 |
+
merges_file="merges.txt",
|
34 |
+
tags_dict= {"a": 0, "abbr": 1, "acronym": 2, "address": 3, "altGlyph": 4, "altGlyphDef": 5, "altGlyphItem": 6, "animate": 7, "animateColor": 8, "animateMotion": 9, "animateTransform": 10, "applet": 11, "area": 12, "article": 13, "aside": 14, "audio": 15, "b": 16, "base": 17, "basefont": 18, "bdi": 19, "bdo": 20, "bgsound": 21, "big": 22, "blink": 23, "blockquote": 24, "body": 25, "br": 26, "button": 27, "canvas": 28, "caption": 29, "center": 30, "circle": 31, "cite": 32, "clipPath": 33, "code": 34, "col": 35, "colgroup": 36, "color-profile": 37, "content": 38, "cursor": 39, "data": 40, "datalist": 41, "dd": 42, "defs": 43, "del": 44, "desc": 45, "details": 46, "dfn": 47, "dialog": 48, "dir": 49, "div": 50, "dl": 51, "dt": 52, "ellipse": 53, "em": 54, "embed": 55, "feBlend": 56, "feColorMatrix": 57, "feComponentTransfer": 58, "feComposite": 59, "feConvolveMatrix": 60, "feDiffuseLighting": 61, "feDisplacementMap": 62, "feDistantLight": 63, "feFlood": 64, "feFuncA": 65, "feFuncB": 66, "feFuncG": 67, "feFuncR": 68, "feGaussianBlur": 69, "feImage": 70, "feMerge": 71, "feMergeNode": 72, "feMorphology": 73, "feOffset": 74, "fePointLight": 75, "feSpecularLighting": 76, "feSpotLight": 77, "feTile": 78, "feTurbulence": 79, "fieldset": 80, "figcaption": 81, "figure": 82, "filter": 83, "font-face-format": 84, "font-face-name": 85, "font-face-src": 86, "font-face-uri": 87, "font-face": 88, "font": 89, "footer": 90, "foreignObject": 91, "form": 92, "frame": 93, "frameset": 94, "g": 95, "glyph": 96, "glyphRef": 97, "h1": 98, "h2": 99, "h3": 100, "h4": 101, "h5": 102, "h6": 103, "head": 104, "header": 105, "hgroup": 106, "hkern": 107, "hr": 108, "html": 109, "i": 110, "iframe": 111, "image": 112, "img": 113, "input": 114, "ins": 115, "kbd": 116, "keygen": 117, "label": 118, "legend": 119, "li": 120, "line": 121, "linearGradient": 122, "link": 123, "main": 124, "map": 125, "mark": 126, "marker": 127, "marquee": 128, "mask": 129, "math": 130, "menu": 131, "menuitem": 132, "meta": 
133, "metadata": 134, "meter": 135, "missing-glyph": 136, "mpath": 137, "nav": 138, "nobr": 139, "noembed": 140, "noframes": 141, "noscript": 142, "object": 143, "ol": 144, "optgroup": 145, "option": 146, "output": 147, "p": 148, "param": 149, "path": 150, "pattern": 151, "picture": 152, "plaintext": 153, "polygon": 154, "polyline": 155, "portal": 156, "pre": 157, "progress": 158, "q": 159, "radialGradient": 160, "rb": 161, "rect": 162, "rp": 163, "rt": 164, "rtc": 165, "ruby": 166, "s": 167, "samp": 168, "script": 169, "section": 170, "select": 171, "set": 172, "shadow": 173, "slot": 174, "small": 175, "source": 176, "spacer": 177, "span": 178, "stop": 179, "strike": 180, "strong": 181, "style": 182, "sub": 183, "summary": 184, "sup": 185, "svg": 186, "switch": 187, "symbol": 188, "table": 189, "tbody": 190, "td": 191, "template": 192, "text": 193, "textPath": 194, "textarea": 195, "tfoot": 196, "th": 197, "thead": 198, "time": 199, "title": 200, "tr": 201, "track": 202, "tref": 203, "tspan": 204, "tt": 205, "u": 206, "ul": 207, "use": 208, "var": 209, "video": 210, "view": 211, "vkern": 212, "wbr": 213, "xmp": 214},
|
35 |
+
add_prefix_space=True,
|
36 |
+
)
|
37 |
+
|
38 |
+
#page_name_1 = "outputTables.html"
|
39 |
+
|
40 |
+
#with open(page_name_1) as f:
|
41 |
+
# single_html_string = f.read()
|
42 |
+
|
43 |
+
# test not batched
|
44 |
+
#encoding = tokenizer(
|
45 |
+
# single_html_string,
|
46 |
+
# padding="max_length",
|
47 |
+
# max_length=512,
|
48 |
+
# stride=128,
|
49 |
+
# truncation=True,
|
50 |
+
# return_overflowing_tokens=True,
|
51 |
+
# return_tensors="pt",
|
52 |
+
#)
|
53 |
+
|
54 |
+
#Show details of encoding
|
55 |
+
#for k, v in encoding.items():
|
56 |
+
# print(k, v.shape)
|
57 |
+
|
58 |
+
#More information prints
|
59 |
+
#print(encoding.input_ids)
|
60 |
+
#print(tokenizer.decode(encoding.input_ids[0].tolist()))
|
61 |
+
|
62 |
+
|
63 |
+
#Question Answering
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
|
69 |
+
#question_answerer = pipeline("question-answering")
|
70 |
#result = question_answerer(question = question, context=context)
|
71 |
#return result['answer']
|
72 |
|
73 |
|
74 |
+
#interface = gr.Interface.from_pipeline(question_answerer,
|
75 |
+
# title = title,
|
76 |
+
# theme = "grass",
|
77 |
+
# examples = [[context, question]]).launch()
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
def greet(name):
    """Answer a question about a fixed HTML snippet using the MarkupLM QA model.

    Despite its name, this is the Gradio callback: the text typed by the user
    is treated as the question, and the context is the hard-coded HTML string
    below. Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        name: Question text coming from the Gradio text input.

    Returns:
        The decoded text of the token span between the highest-scoring start
        position and the highest-scoring end position (inclusive).
    """
    question = name
    single_html_string = "<p>Grid Fin</p><table><tr><td><p>Technical Risk</p></td><td><p>Cost Risk</p></td><td><p>Schedule Risk</p></td></tr><tr><td><p>None</p></td><td><p>None</p></td><td><p>Weather delays </p></td></tr></table><p>Propulsion</p><table><tr><td><p>Technical Risk</p></td><td><p>Cost Risk</p></td><td><p>Schedule Risk</p></td></tr><tr><td><p>None</p></td><td><p>Materials overrun</p></td><td><p>None</p></td></tr></table>"

    # NOTE(review): max_length=30 is shared by the question and the HTML, and
    # truncation=True is set, so most of the HTML context is likely cut off
    # before the model sees it -- TODO confirm this small cap is intentional.
    inputs = tokenizer.encode_plus(
        question,
        single_html_string,
        return_tensors="pt",
        padding="max_length",
        max_length=30,
        truncation=True,
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        answer_start_scores, answer_end_scores = model(**inputs, return_dict=False)

    # Most likely first token of the answer.
    answer_start = torch.argmax(answer_start_scores)
    # Most likely last token of the answer; +1 makes the slice end exclusive.
    answer_end = torch.argmax(answer_end_scores) + 1

    # Decode the selected span once (the original decoded it twice and threw
    # the first result away).
    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    answerForGradio = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(answer_ids)
    )

    return answerForGradio
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
|
108 |
|
109 |
|
110 |
+
# Wire `greet` into a single text-in / text-out Gradio UI and start serving it.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
iface.launch()
|
|
|
|