import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import WebDriverException
import openai
import time
import re
import replicate
import os
import io
import base64
import random
from PIL import Image
import pandas as pd
import gradio as gr
from io import BytesIO
import datetime
import pytz
import csv
import cv2
import tempfile
from io import StringIO
from huggingface_hub import (
create_repo,
get_full_repo_name,
upload_file,
)
def split_article(article_text):
    """Split a text into four consecutive, roughly equal word chunks.

    The text is tokenized on whitespace; cut points sit at 1/4, 1/2 and 3/4
    of the word count (integer division), so the last chunk absorbs any
    remainder and leading chunks may be empty for very short inputs.

    Returns:
        A 4-tuple of strings, each chunk's words re-joined by single spaces.
    """
    tokens = article_text.split()
    count = len(tokens)
    # Consecutive pairs of bounds delimit the four quarters.
    bounds = [0, count // 4, count // 2, (3 * count) // 4, count]
    quarters = [
        ' '.join(tokens[start:stop])
        for start, stop in zip(bounds, bounds[1:])
    ]
    return quarters[0], quarters[1], quarters[2], quarters[3]
def replace_content(content, replacements):
    """Apply every regex substitution in ``replacements`` to ``content``.

    Args:
        content: The text to rewrite.
        replacements: Mapping of regular-expression pattern -> replacement
            string. Entries are applied one after another in the mapping's
            iteration order, each on the output of the previous one.

    Returns:
        The text after all substitutions have been applied.
    """
    rewritten = content
    for regex in replacements:
        rewritten = re.sub(regex, replacements[regex], rewritten)
    return rewritten
def generate_patterns(base_replacements):
    """Expand a phrase mapping with capitalized/upper/lower case variants.

    For every ``key -> value`` pair the result also contains
    ``key.capitalize() -> value.capitalize()``, ``key.upper() -> value.upper()``
    and ``key.lower() -> value.lower()``.

    Args:
        base_replacements: Mapping of source phrase -> replacement phrase.

    Returns:
        A new dict holding the original pairs plus their case variants,
        ordered from the longest key to the shortest.
    """
    patterns = {}
    for key, value in base_replacements.items():
        patterns[key.capitalize()] = value.capitalize()
        patterns[key.upper()] = value.upper()
        patterns[key.lower()] = value.lower()
    # The caller's exact spelling always wins over a derived case variant
    # (otherwise e.g. {'foo': 'Bar'} would end up mapping 'foo' to 'bar').
    patterns.update(base_replacements)
    # Substitutions are applied sequentially in dict order, so longer phrases
    # must come first: otherwise 'Layanan Pelanggan' is rewritten before
    # 'Alat Layanan Pelanggan' gets a chance to match.
    longest_first = sorted(
        patterns.items(), key=lambda pair: len(pair[0]), reverse=True
    )
    return dict(longest_first)
# Indonesian phrases (keys) and the English industry terms (values) that are
# substituted for them in generated titles. Entry order is significant because
# the substitutions are applied one after another.
base_replacements = {
    'Layanan Pelanggan': 'Customer Service',
    'Pusat Kontak': 'Contact Center',
    'Multi Kanal': 'Omnichannel',
    'Saluran Omni': 'Omnichannel',
    'Merek': 'Brand',
    'Komputasi Awan': 'Cloud Computing',
    'Kecerdasan Buatan': 'Artificial Intelligence',
    'Pembelajaran Mesin': 'Machine Learning',
    'Alat Layanan Pelanggan': 'Customer Service Tools',
    'Pengalaman Pelanggan': 'Customer Experience',
    'AI Percakapan': 'AI Conversation',
    'Aplikasi pesan': 'Message app',
    'Visi Komputer': 'Computer Vision',
}
def get_openai_response(messages, api_key):
    """Request a chat completion from OpenAI's ``gpt-3.5-turbo`` model.

    Args:
        messages: Chat messages (list of ``{"role": ..., "content": ...}``).
        api_key: OpenAI API key; stored on the ``openai`` module before the call.

    Returns:
        The text of the first choice when its ``finish_reason`` is ``'stop'``
        or ``'length'``; ``None`` for any other finish reason.
    """
    # get_azure_response() switches the module-wide client settings to Azure.
    # If that happened earlier in this process, restore the public OpenAI
    # endpoint, otherwise this request is sent with Azure settings.
    if openai.api_type in ("azure", "azure_ad", "azuread"):
        openai.api_type = "open_ai"
        openai.api_base = "https://api.openai.com/v1"
        openai.api_version = None
    openai.api_key = api_key
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
    )
    choice = response['choices'][0]
    if choice['finish_reason'] in ('length', 'stop'):
        return choice['message']['content']
    # Any other finish reason yields no usable text.
    return None
def get_azure_response(messages, api_key, azure_api_base):
    """Request a chat completion from the Azure ``gpt-35-turbo`` engine.

    Configures the ``openai`` module for Azure (API type, version
    ``2023-05-15``, base URL and key) and sends ``messages`` with
    temperature 0. These settings are module-wide and stay in effect after
    the call.

    Returns:
        The text of the first choice when its ``finish_reason`` is ``'stop'``
        or ``'length'``; ``None`` for any other finish reason.
    """
    openai.api_key = api_key
    openai.api_base = azure_api_base
    openai.api_version = "2023-05-15"
    openai.api_type = "azure"
    completion = openai.ChatCompletion.create(
        engine="gpt-35-turbo",
        messages=messages,
        temperature=0,
    )
    first_choice = completion['choices'][0]
    if first_choice['finish_reason'] not in ('length', 'stop'):
        return None
    return first_choice['message']['content']
def cek_url(url):
    """Report whether ``url`` is already listed in ``log_activity.txt``.

    The log is read from the current working directory, one URL per line;
    surrounding whitespace on each line is ignored. This function only reads
    the log, it never writes to it.

    Returns:
        ``True`` if the URL is listed, ``False`` otherwise. A missing log
        file means nothing has been recorded yet and yields ``False``.
    """
    try:
        with open("log_activity.txt", 'r') as file:
            scraped_urls = {line.strip() for line in file}
    except FileNotFoundError:
        # First run: no log yet, so no URL can have been seen.
        return False
    return url in scraped_urls
def scrap_portal(queri):
    """Search Google Custom Search for ``queri`` and return result links.

    A random topic suffix (' articles', ' news', ' trends', ' technologies'
    or ' future') is appended to the query, then three result pages are
    requested (start indices 1, 11 and 21). Pages that do not answer with
    HTTP 200 are reported on stdout and skipped.

    Args:
        queri: The base search phrase.

    Returns:
        The collected result URLs, in result order, excluding any URL that
        contains "categories", "tags" or "help".
    """
    # NOTE(review): these credentials are committed in source. They should be
    # rotated and loaded from configuration/environment instead.
    api_key = 'AIzaSyDJUWVZG2oHkHSsYoqdqgUZwQC2Aa2kSok'
    search_engine_id = 'a0dc878459ceb4811'
    num_pages = 3
    topic_suffix = random.choice([' articles',' news',' trends',' technologies', ' future'])
    query = queri + topic_suffix
    link = []
    for page in range(num_pages):
        # Passing the values via `params` lets requests URL-encode them, so
        # spaces, '&' or '#' in the query cannot corrupt the request URL.
        params = {
            'key': api_key,
            'cx': search_engine_id,
            'q': query,
            'start': page * 10 + 1,
        }
        # A timeout keeps a stalled connection from blocking the run forever.
        response = requests.get(
            'https://www.googleapis.com/customsearch/v1',
            params=params,
            timeout=30,
        )
        if response.status_code == 200:
            data = response.json()
            # A page without results has no 'items' entry; treat it as empty.
            link.extend(item['link'] for item in data.get('items', []))
        else:
            print(f"Permintaan halaman {page + 1} gagal. Kode status:", response.status_code)
    excluded_fragments = ("categories", "tags", "help")
    return [
        url for url in link
        if not any(fragment in url for fragment in excluded_fragments)
    ]
def _ask_model(messages, models, api_key, azure_api_base, pause=2):
    """Send ``messages`` to the selected backend, then wait ``pause`` seconds."""
    if models == 'openai':
        reply = get_openai_response(messages, api_key)
    else:
        reply = get_azure_response(messages, api_key, azure_api_base)
    if pause:
        time.sleep(pause)
    return reply


def clean_scrap(artikel,models,api_key,azure_api_base,keyword):
    """Clean scraped article text, paraphrase it and produce an Indonesian title.

    Steps:
      1. Each chunk in ``artikel`` is cleaned by the model; when there is
         more than one chunk the model is also asked to summarize it.
      2. The cleaned chunks are joined and paraphrased; the model is asked
         for a title plus the paraphrased article.
      3. The first line of that answer is taken as the English title, which
         is rewritten into an Indonesian title containing ``keyword``.
      4. Known Indonesian phrases in the Indonesian title are swapped for the
         English terms in ``base_replacements``.

    Args:
        artikel: List of raw article text chunks.
        models: ``'openai'`` selects OpenAI; any other value selects Azure.
        api_key: API key for the selected backend.
        azure_api_base: Azure endpoint (only used for the Azure backend).
        keyword: Focus keyword that must appear in the Indonesian title.

    Returns:
        ``(title, judul, contents)``: the English title, the Indonesian
        title, and a one-element list holding the paraphrased article text.

    NOTE(review): the backend helpers return ``None`` for an unusable
    completion; such a reply still raises here (``TypeError`` /
    ``AttributeError``) exactly as before.
    """
    editor_role = {"role": "system", "content": "You are a very professional article editor."}
    clean_request = "I have a raw article that contains a lot of unnecessary data such as ads, website information, and article publishers, as well as links to other pages, and so on. Please clean up the article I provided so that only the article's content remains."
    if len(artikel) > 1:
        # Several chunks means the source was long: also ask for a summary.
        clean_request += " \nThen, you should also summarize the article so that it does not exceed 5000 characters"
        output_format = "{the cleaned and summarized article's content}"
    else:
        output_format = "{the cleaned article's content}"
    clean_suffix = "\nDo not write any explanation and any pleasantries. Please use the following complete format to display the output: " + output_format

    new_artikel = []
    for art in artikel:
        messages = [
            editor_role,
            {"role": "user", "content": clean_request + art + clean_suffix},
        ]
        result = _ask_model(messages, models, api_key, azure_api_base)
        if models == 'openai':
            # Progress echo kept from the original OpenAI code path.
            print(result)
        new_artikel.append(result)

    cleaned_text = ' '.join(new_artikel)
    messages = [
        {"role": "system", "content": "You are a very professional article editor and capable of generating compelling and professional article titles."},
        {"role": "user", "content": "Paraphrase the above article to make it a well-written and easily understandable piece for humans, following the conventions of renowned articles. \nThen, You Must Generate a title that is appropriate for the article I provided. The title should be professional, similar to typical article titles and sound more natural for a human to read" + cleaned_text + "\nDo not write any explanation and any pleasantries. Please use the following complete format to display the output: title:{title}, article: {new paraphrased article}"}
    ]
    paraphrased = _ask_model(messages, models, api_key, azure_api_base)

    # First line of the answer is treated as the title, the rest as the body.
    content = paraphrased.split("\n")
    title = content[0].replace('title:', '').replace("Title:", '').strip()

    messages = [
        {"role": "system", "content": "You are a professional translator and rewriter"},
        {"role": "user", "content": f"Please translate and rewrite this sentence into Indonesian language with the following requirements: \n1. The sentence should be concise, compact, and clear. \n2. The sentence length should not exceed 50 characters. \n3. The sentences should be professional, similar to typical article titles and sound more natural for a human to read. \n4. fokus keyword menggunakan keyword {keyword} harus ada di awal judul. \n5. Gaya Penulisan judul artikel seperti gaya forbes. \n6. Menggunakan bahasa indonesia yag mudah dipahami/familiar oleh manusia , :" +title+"\nDo not write any explanation and any pleasantries. Please use the following complete format to display the output: Judul:{hasil rewrite}"}
    ]
    # The original code does not pause after this last request.
    judul = _ask_model(messages, models, api_key, azure_api_base, pause=0)
    judul = judul.replace("Judul:", '').strip()
    judul = judul.replace("Title:", '').strip()
    try:
        judul = replace_content(judul, generate_patterns(base_replacements))
    except re.error:
        # Term substitution is best-effort: keep the untouched title if a
        # pattern or replacement string is not valid for `re.sub`.
        pass

    body_text = ' '.join(content[1:]).replace("article:", '').replace("Article:", '').strip()
    contents = [body_text]
    return title, judul, contents
def _load_paragraphs(wd, url):
    """Open ``url`` in the driver and return the text of every ``<p>`` element."""
    wd.get(url)
    # Send Ctrl+End to the page body, presumably to scroll down so that
    # lazily loaded content gets rendered; then give it a second.
    wd.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.CONTROL, Keys.END)
    time.sleep(1)
    raw_html = wd.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
    soup_html = BeautifulSoup(raw_html, "html.parser")
    return [paragraph.get_text() for paragraph in soup_html.findAll('p')]


def scrap_artikel(source_type,source,models,api_key,azure_api_base,keyword):
    """Scrape one article with headless Chrome and hand it to ``clean_scrap``.

    Args:
        source_type: ``"keyword"`` to search for an article via
            ``scrap_portal``; any other value treats ``source`` as a URL.
        source: Search phrase (keyword mode) or article URL.
        models, api_key, azure_api_base, keyword: Passed through to
            ``clean_scrap``.

    Returns:
        ``(title, judul, url, contents)`` where ``url`` is the page the text
        was taken from. In keyword mode it is ``""`` when no usable page was
        found.

    In keyword mode, URLs already recorded according to ``cek_url`` are
    skipped, and the first remaining page that yields at least one ``<p>``
    element is used. Text of 18000 characters or more is split into four
    chunks before cleaning.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    options.add_argument(f"user-agent={user_agent}")
    wd = webdriver.Chrome(options=options)

    artikel = []
    try:
        if source_type == "keyword":
            URL = ""
            for url in scrap_portal(source):
                if cek_url(url):
                    continue
                artikel = _load_paragraphs(wd, url)
                if artikel:
                    # One article is enough; remember where it came from.
                    URL = url
                    break
        else:
            URL = source
            artikel = _load_paragraphs(wd, source)
    finally:
        # Quit exactly once, after all page loads: the browser must stay
        # alive while further candidate URLs are tried, and must not be
        # leaked when nothing is visited or a page load fails.
        wd.quit()

    paragraf = ' '.join(artikel)
    if len(paragraf) >= 18000:
        artikels = list(split_article(paragraf))
    else:
        artikels = [paragraf]
    title, judul, contents = clean_scrap(artikels,models,api_key,azure_api_base,keyword)
    return title, judul, URL, contents
def artikel_processing(source_type,source,backlink,keyword,models,api_key,azure_api_base,replicate_key):
title, judul, url, artikel= scrap_artikel(source_type,source, models, api_key,azure_api_base,keyword)
translated = []
optimized = []
edited_format = []
article = []
post_article = []
for i in artikel:
messages=[
{"role": "system", "content": "You are a proficient English to Indonesian language translator machine. You are capable of translating professionally according to the rules of the Indonesian language"},
{"role": "user", "content": "Translate the following article into Indonesian language. Then, you must resume the article translated. The translated result should be more than 2500 characters and less than 7000 characters.: " + i + "\nDo not write any explanation and any pleasantries. Please use the following complete format to display the output: {Professionally rewritten content}"}
]
if models == 'openai':
translate = get_openai_response(messages,api_key)
translated.append(translate)
time.sleep(2)
else:
translate = get_azure_response(messages,api_key,azure_api_base)
translated.append(translate)
time.sleep(2)
for i in translated:
messages=[
{"role": "system", "content": f"""
You are a professional article writer and editor. I have an article that needs your editing expertise to align its writing style with specific instructions and guidelines:
1. Theme and Title: The blog article should have a clear and informative title that reflects the main topic.
2. Writing Style: The writing style in the blog article should appear serious, informative, and academic. It should use formal language to convey the importance of the discussed topic. Sentences should be long and rich in information.
3. Use of Data and Statistics: The blog article should support its arguments with concrete data and statistics.
4. Tone and Emotion: Despite the seriousness of the topic, the blog should not be overly emotional in its delivery. You should focus more on presenting facts and analysis rather than creating an emotional effect.
5. Subheadings: The use of subheadings should help readers follow the flow of the article and understand key points more easily.
6. Citations and Sources: The blog should cite reliable sources.
7. Graphics: The blog should use graphics to visualize data clearly.
8. SEO Keywords: Use keyword {keyword} that will help the blog become more discoverable in search results.
9. Conclusion: The blog should also have a clear conclusion that summarizes the core findings of the study.
10. Final Thought: You should conclude the blog by providing readers with broader insights on the topic.
11. meta decription max 160 character
12. sentences max 20 words
13. paragraph max 300 words
14. focus keyword {keyword} harus ada di content
15. focus keyword {keyword} harus ada di intro
16. focus keyword {keyword} harus ada di meta desccripton
17. focus keyword {keyword} harus ada di url
18. focus keyword {keyword} harus ada di intro
Here is the article that you need to edit to adhere to these 18 criteria: {i}
Please do not change the existing format in the article, just adjust the writing style according to the 10 criteria I mentioned.
""" },
{"role": "user", "content": "Please ensure the usage of proper and correct Indonesian language. \nDo not write any explanation and any pleasantries. Provide only the rewrited article using this format: {rewrited article}"}
]
if models == 'openai':
result = get_openai_response(messages,api_key)
article.append(result)
time.sleep(2)
else:
result = get_azure_response(messages,api_key,azure_api_base)
article.append(result)
time.sleep(2)
for i in article:
messages=[
{"role": "system", "content": "You are a professional article editor machine."},
{"role": "user", "content": "Please rewrite the given article in the style of a professional writer for Forbes or The New York Times with bahasa indonesia as your native language:\n\n" + i + "\nAdd underline tags and bold tags to all foreign terms (non-Indonesian words) you encounter. You only have less than 7 attempts to do this, no more than that in order to keep the article neat and clean. \nThen, You must divide the article into several paragraphs, no less than 3 paragraphs. kamu juga harus membuat subheading menggunakan pada setiap sub topik pembahasan \n\nPlease ensure the usage of proper and correct Indonesian language. \nDo not write any explanation and any pleasantries. Provide only the reformatted article using this format:
A brief headline of the article content
reformatted article
"} ] if models == 'openai': font_formatted = get_openai_response(messages,api_key) edited_format.append(font_formatted) time.sleep(2) else: font_formatted = get_azure_response(messages,api_key,azure_api_base) edited_format.append(font_formatted) time.sleep(2) for i in edited_format: messages=[ {"role": "system", "content": "You are a professional article editor machine."}, {"role": "user", "content": "Please edit the given article:\n" + "\n" + i + f"\nAdd 3 annotations (Maximum) to the words with the keywords {keyword} to format them as links in the HTML structure.the link should be connected to {backlink} \nThe format should be like this: {keyword}. YOU MUST Do this FORMAT ONLY for the first 3 keywords that appear and MUST be on different keywords, IF a keyword appears more than twice then simply ignored it by not adding any links to those keywords. Do not combine two keyword into one or modify any keyword. You only have less than 3 attempts to do this, no more than that in order to keep the article neat and clean. \nExcept for the terms {keyword} you are prohibited from providing backlinks. Additionally, you are not allowed to include backlinks to individuals' names or technology company names such as Google, Microsoft, and others. \nYou only have less than 3 attempts to do this, no more than that in order to keep the article neat and clean.\nPlease ensure the usage of proper and correct Indonesian language. \nDo not write any explanation and any pleasantries."+"Provide only the reformatted article using this format: {new_formatted_article}"} ] if models == 'openai': artikel_post = get_openai_response(messages,api_key) post_article.append(artikel_post ) time.sleep(2) else: artikel_post = get_azure_response(messages,api_key,azure_api_base) post_article.append(artikel_post ) time.sleep(2) meta_keywords = '