Update app.py
app.py
CHANGED
@@ -129,7 +129,7 @@ def scrap_portal(queri):
    filter_link2 = [url for url in filter_link1 if "help" not in url]
    return filter_link2

-def clean_scrap(artikel,link,models,api_key,azure_api_base,keyword):
+def clean_scrap(artikel,models,api_key,azure_api_base,keyword):
    new_artikel = []
    article = []
    if len(artikel) > 1:
@@ -201,7 +201,7 @@ def clean_scrap(artikel,link,models,api_key,azure_api_base,keyword):
    contents = content[1:]
    contents = [' '.join(contents).replace("article:", '').replace("Article:", '').strip()]

-    return title, judul,
+    return title, judul, contents

def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
    options = webdriver.ChromeOptions()
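These two hunks drop the unused `link` parameter from `clean_scrap` and complete its return statement, so callers now get a three-tuple. A minimal stub, purely to illustrate the shape `scrap_artikel` unpacks below (the placeholder bodies are not the app's logic; the real function presumably uses `models`/`api_key`/`azure_api_base` to generate the title and content):

    # Illustrative stub only: shows the (title, judul, contents) shape.
    def clean_scrap(artikel, models, api_key, azure_api_base, keyword):
        title = keyword.title()                  # placeholder title
        judul = title                            # "judul" is Indonesian for "title"
        contents = [' '.join(artikel).strip()]   # one cleaned article string in a list
        return title, judul, contents

    title, judul, contents = clean_scrap(["some scraped text"], None, None, None, "contoh keyword")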
@@ -216,6 +216,7 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):

    if source_type == "keyword":
        artikel =[]
+        URL = None
        link = scrap_portal(source)
        for url in link:
            if cek_url(url):
@@ -236,16 +237,17 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
                for paragraph in containers:
                    artic=paragraph.get_text()
                    artikel.append(artic)
+                    URL = URL + url

-
-
-
-
-
-
-
+        paragraf = ' '.join(artikel)
+        if len(paragraf)>= 18000:
+            part1, part2, part3, part4 = split_article(paragraf)
+            artikels = [part1, part2, part3, part4]
+        else :
+            artikels = [paragraf]
+        title, judul, contents = clean_scrap(artikels,models,api_key,azure_api_base,keyword)

-
+        return title, judul, URL, contents

    else:
        wd.get(source)
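A review note on this hunk: `URL` was initialised to `None` in the previous hunk, so the first `URL = URL + url` raises `TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'`; and because the assignment sits inside the paragraph loop, each url would be appended once per paragraph rather than once per page. A minimal sketch of the safer pattern, assuming the intent is just to record which URLs were scraped (the literal links are stand-ins, not from the commit):

    # Sketch only, not the commit's code: start from an empty string (or a
    # list) so the first concatenation never evaluates None + str.
    URL = ""
    artikel = []
    for url in ["https://example.com/a", "https://example.com/b"]:  # stand-in for scrap_portal output
        artikel.append("paragraph text from " + url)                # stand-in for the BeautifulSoup loop
        URL = URL + url + " "                                       # mirrors URL = URL + url, once per url
    print(URL.strip())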
@@ -270,9 +272,9 @@ def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
            artikels = [part1, part2, part3, part4]
        else :
            artikels = [paragraf]
-        title, judul,
+        title, judul, contents = clean_scrap(artikels,models,api_key,azure_api_base,keyword)

-        return title, judul,
+        return title, judul, source, contents

def artikel_processing(source_type,source,backlink,keyword,models,api_key,azure_api_base,replicate_key):
    title, judul, url, artikel= scrap_artikel(source_type,source, models, api_key,azure_api_base,keyword)
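With this hunk both branches of `scrap_artikel` return the same four-tuple, the third slot holding the accumulated URLs in the keyword branch and the raw `source` otherwise, matching the unpacking in `artikel_processing`. A dummy with the post-commit contract, for a quick shape check only (all values are placeholders):

    def scrap_artikel(source_type, source, models, api_key, azure_api_base, keyword):
        if source_type == "keyword":
            return "Title", "Judul", "https://a.example https://b.example", ["body"]
        return "Title", "Judul", source, ["body"]

    title, judul, url, artikel = scrap_artikel("url", "https://news.example/item",
                                               None, None, None, "promo")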
@@ -486,29 +488,12 @@ def artikel_processing(source_type,source,backlink,keyword,models,api_key,azure_api_base,replicate_key):
        os.remove(tmp_path)
    except:
        image = Image.open('botika_logo.jpeg')
-
-        w,h = image.size
-        new_w = int(w/1.641)
-        new_h = int(h/1.641)
-        image = image.resize((new_w, new_h),Image.ANTIALIAS)
-        tmp_path = "image.png"
-        image.save(tmp_path)
-        with open(tmp_path, 'rb') as open_file:
-            byte_img = open_file.read()
-        base64_bytes = base64.b64encode(byte_img)
-        base64_string = base64_bytes.decode('utf-8')
-        base64_string = base64.b64decode(base64_string)
-        image_data= base64_string
-        os.remove(tmp_path)
+
    return judul,content,image,image_data,url

def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
    # try:
-    judul,kontent,gambar,
-    title = '<h1>'+judul+'</h1>'
-    desired_timezone = pytz.timezone('Asia/Jakarta')
-    current_time = datetime.datetime.now(desired_timezone)
-    Timestamp = current_time.strftime('%Y-%m-%d %H:%M:%S')
+    judul,kontent,gambar,image_data,url= artikel_processing(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key)

    with open("judul.txt", "w") as file:
        file.write(judul)
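Worth flagging in this hunk: the removed block was what resized the fallback logo and produced `image_data`, so the `except:` branch now opens `botika_logo.jpeg` and falls through to `return judul,content,image,image_data,url` without assigning `image_data`, raising `UnboundLocalError` whenever the `try` failed before setting it. A sketch of the dropped behaviour as a helper, assuming the base64 encode/decode round trip was only ever meant to yield the image's raw bytes (`fallback_logo` is an illustrative name, not from the commit):

    import base64, os
    from PIL import Image

    def fallback_logo(path='botika_logo.jpeg', scale=1.641):
        image = Image.open(path)
        w, h = image.size
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the equivalent filter.
        image = image.resize((int(w / scale), int(h / scale)), Image.LANCZOS)
        tmp_path = 'image.png'
        image.save(tmp_path)
        with open(tmp_path, 'rb') as f:
            byte_img = f.read()
        os.remove(tmp_path)
        # b64encode followed by b64decode is an identity on bytes, so the
        # original round trip reduced to the raw PNG bytes.
        return image, base64.b64decode(base64.b64encode(byte_img))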
@@ -548,8 +533,7 @@ def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):

    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
        temp_file.write(combined_data)
-
-
+
    repo_name = get_full_repo_name(model_id="Article_Gen4", token="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")
    file_url = upload_file(
        path_or_fileobj=temp_file.name,
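Separate from the whitespace change, this hunk keeps a live `hf_...` token hard-coded next to `get_full_repo_name`/`upload_file`; anyone who can read the Space's files can use it. A minimal sketch of the conventional fix, assuming the token is stored as a Space secret named `HF_TOKEN` (that variable name and the `path_in_repo` value are illustrative, not from the diff):

    import os
    from huggingface_hub import get_full_repo_name, upload_file

    token = os.environ["HF_TOKEN"]  # injected as a Space secret, never committed
    repo_name = get_full_repo_name(model_id="Article_Gen4", token=token)
    file_url = upload_file(
        path_or_fileobj=temp_file.name,   # temp_file from the surrounding code
        path_in_repo="log_activity.txt",  # illustrative target path
        repo_id=repo_name,
        token=token,
    )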
@@ -561,6 +545,7 @@ def scrap(source_type,source,backlink,keyword,version,api_key,azure_api_base,replicate_key):
        status = "<h3>Berhasil Generate Artikel</h3>"
        time.sleep(60)
        return status,gambar
+
    else:
        with open('log_activity.txt', 'r') as file:
            existing_data = file.read()