Recipe scraper fixes

This commit is contained in:
Vicnet
2023-11-09 11:24:27 +01:00
parent 280ae5a261
commit 03f821c1ba
6 changed files with 212 additions and 78 deletions

View File

@@ -33,9 +33,9 @@
A70D7CA12AC73CA800D53DBF /* RecipeEditView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A70D7CA02AC73CA700D53DBF /* RecipeEditView.swift */; };
A70D7CA32AC74B3B00D53DBF /* DateExtension.swift in Sources */ = {isa = PBXBuildFile; fileRef = A70D7CA22AC74B3B00D53DBF /* DateExtension.swift */; };
A74D33BE2AF82AAE00D06555 /* SwiftSoup in Frameworks */ = {isa = PBXBuildFile; productRef = A74D33BD2AF82AAE00D06555 /* SwiftSoup */; };
A74D33C32AFCD1C300D06555 /* RecipeScraper.swift in Sources */ = {isa = PBXBuildFile; fileRef = A74D33C22AFCD1C300D06555 /* RecipeScraper.swift */; };
A76B8A6F2ADFFA8800096CEC /* SupportedLanguage.swift in Sources */ = {isa = PBXBuildFile; fileRef = A76B8A6E2ADFFA8800096CEC /* SupportedLanguage.swift */; };
A76B8A712AE002AE00096CEC /* AlertHandler.swift in Sources */ = {isa = PBXBuildFile; fileRef = A76B8A702AE002AE00096CEC /* AlertHandler.swift */; };
A781E7612AF822D000452F6F /* RecipeScraper.swift in Sources */ = {isa = PBXBuildFile; fileRef = A781E7602AF822CF00452F6F /* RecipeScraper.swift */; };
A7AEAE642AD5521400135378 /* Localizable.xcstrings in Resources */ = {isa = PBXBuildFile; fileRef = A7AEAE632AD5521400135378 /* Localizable.xcstrings */; };
A7F3F8E82ACBFC760076C227 /* KeywordPickerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A7F3F8E72ACBFC760076C227 /* KeywordPickerView.swift */; };
A7F3F8EA2ACC221C0076C227 /* CategoryPickerView.swift in Sources */ = {isa = PBXBuildFile; fileRef = A7F3F8E92ACC221C0076C227 /* CategoryPickerView.swift */; };
@@ -88,10 +88,10 @@
A703226E2ABB1DD700D7C4ED /* ColorExtension.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ColorExtension.swift; sourceTree = "<group>"; };
A70D7CA02AC73CA700D53DBF /* RecipeEditView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RecipeEditView.swift; sourceTree = "<group>"; };
A70D7CA22AC74B3B00D53DBF /* DateExtension.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DateExtension.swift; sourceTree = "<group>"; };
A74D33BF2AF82CB500D06555 /* Scraper.playground */ = {isa = PBXFileReference; lastKnownFileType = file.playground; path = Scraper.playground; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.swift; };
A74D33BF2AF82CB500D06555 /* TestScraper.playground */ = {isa = PBXFileReference; lastKnownFileType = file.playground; path = TestScraper.playground; sourceTree = "<group>"; xcLanguageSpecificationIdentifier = xcode.lang.swift; };
A74D33C22AFCD1C300D06555 /* RecipeScraper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RecipeScraper.swift; sourceTree = "<group>"; };
A76B8A6E2ADFFA8800096CEC /* SupportedLanguage.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SupportedLanguage.swift; sourceTree = "<group>"; };
A76B8A702AE002AE00096CEC /* AlertHandler.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AlertHandler.swift; sourceTree = "<group>"; };
A781E7602AF822CF00452F6F /* RecipeScraper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RecipeScraper.swift; sourceTree = "<group>"; };
A7AEAE632AD5521400135378 /* Localizable.xcstrings */ = {isa = PBXFileReference; lastKnownFileType = text.json.xcstrings; path = Localizable.xcstrings; sourceTree = "<group>"; };
A7F3F8E72ACBFC760076C227 /* KeywordPickerView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = KeywordPickerView.swift; sourceTree = "<group>"; };
A7F3F8E92ACC221C0076C227 /* CategoryPickerView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CategoryPickerView.swift; sourceTree = "<group>"; };
@@ -255,8 +255,8 @@
A781E75F2AF8228100452F6F /* RecipeImport */ = {
isa = PBXGroup;
children = (
A781E7602AF822CF00452F6F /* RecipeScraper.swift */,
A74D33BF2AF82CB500D06555 /* Scraper.playground */,
A74D33BF2AF82CB500D06555 /* TestScraper.playground */,
A74D33C22AFCD1C300D06555 /* RecipeScraper.swift */,
);
path = RecipeImport;
sourceTree = "<group>";
@@ -421,7 +421,7 @@
A703226A2ABAF49800D7C4ED /* JSONCoderExtension.swift in Sources */,
A703226D2ABAF90D00D7C4ED /* APIController.swift in Sources */,
A70171822AA8E71900064C43 /* Nextcloud_Cookbook_iOS_ClientApp.swift in Sources */,
A781E7612AF822D000452F6F /* RecipeScraper.swift in Sources */,
A74D33C32AFCD1C300D06555 /* RecipeScraper.swift in Sources */,
A70171AD2AA8EF4700064C43 /* MainViewModel.swift in Sources */,
A76B8A6F2ADFFA8800096CEC /* SupportedLanguage.swift in Sources */,
A70171C92AB4CBB400064C43 /* OnboardingView.swift in Sources */,

View File

@@ -51,6 +51,27 @@
<key>orderHint</key>
<integer>4</integer>
</dict>
<key>TestScraper (Playground) 1.xcscheme</key>
<dict>
<key>isShown</key>
<false/>
<key>orderHint</key>
<integer>8</integer>
</dict>
<key>TestScraper (Playground) 2.xcscheme</key>
<dict>
<key>isShown</key>
<false/>
<key>orderHint</key>
<integer>9</integer>
</dict>
<key>TestScraper (Playground).xcscheme</key>
<dict>
<key>isShown</key>
<false/>
<key>orderHint</key>
<integer>7</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -2,14 +2,14 @@
// RecipeScraper.swift
// Nextcloud Cookbook iOS Client
//
// Created by Vincent Meilinger on 05.11.23.
// Created by Vincent Meilinger on 09.11.23.
//
import Foundation
import SwiftSoup
class RecipeScraper {
func scrape(url: String) -> RecipeDetail? {
func scrape(url: String) throws -> RecipeDetail? {
var contents: String? = nil
if let url = URL(string: url) {
do {
@@ -26,42 +26,87 @@ class RecipeScraper {
print("ERROR: no contents")
exit(1)
}
let doc = try SwiftSoup.parse(html)
let doc: Document = try SwiftSoup.parse(html)
let elements: Elements = try doc.select("script")
for elem in elements.array() {
for attr in elem.getAttributes()!.asList() {
if attr.getValue() == "application/ld+json" {
toDict(elem)
guard let dict = toDict(elem) else { continue }
return getRecipe(fromDict: dict)
}
}
}
return nil
}
private func toDict(_ elem: Element) -> [String: Any] {
private func toDict(_ elem: Element) -> [String: Any]? {
var recipeDict: [String: Any]? = nil
do {
let jsonString = try elem.html()
//print(json)
let json = try JSONSerialization.jsonObject(with: jsonString.data(using: .utf8)!, options: .fragmentsAllowed)
if let recipe = json as? [String : Any] {
return recipe
recipeDict = recipe
} else if let recipe = (json as! [Any])[0] as? [String : Any] {
return recipe
recipeDict = recipe
}
} catch {
print("COULD NOT DECODE")
print("Unable to decode json")
return nil
}
guard let recipeDict = recipeDict else {
print("Json is not a dict")
return nil
}
if recipeDict["@type"] as? String ?? "" == "Recipe" {
return recipeDict
} else if (recipeDict["@type"] as? [String] ?? []).contains("Recipe") {
return recipeDict
} else {
print("Json dict is not a recipe ...")
return nil
}
}
private func getRecipe(fromDict recipe: Dictionary<String, Any>) {
if recipe["@type"] as? String ?? "" == "Recipe" {
print(recipe["name"] ?? "No name")
print(recipe["recipeIngredient"] ?? "No ingredients")
print(recipe["recipeInstruction"] ?? "No instruction")
} else if (recipe["@type"] as? [String] ?? []).contains("Recipe") {
print(recipe["name"] ?? "No name")
}
private func getRecipe(fromDict recipe: Dictionary<String, Any>) -> RecipeDetail? {
var recipeDetail = RecipeDetail()
recipeDetail.name = recipe["name"] as? String ?? "New Recipe"
recipeDetail.recipeCategory = recipe["recipeCategory"] as? String ?? ""
recipeDetail.keywords = recipe["keywords"] as? String ?? ""
recipeDetail.description = recipe["description"] as? String ?? ""
recipeDetail.dateCreated = recipe["dateCreated"] as? String ?? ""
recipeDetail.dateModified = recipe["dateModified"] as? String ?? ""
recipeDetail.imageUrl = recipe["imageUrl"] as? String ?? ""
recipeDetail.url = recipe["url"] as? String ?? ""
recipeDetail.cookTime = recipe["cookTime"] as? String ?? ""
recipeDetail.prepTime = recipe["prepTime"] as? String ?? ""
recipeDetail.totalTime = recipe["totalTime"] as? String ?? ""
recipeDetail.recipeInstructions = stringArrayForKey("recipeInstructions", dict: recipe)
recipeDetail.recipeYield = recipe["recipeYield"] as? Int ?? 0
recipeDetail.recipeIngredient = recipe["recipeIngredient"] as? [String] ?? []
recipeDetail.tool = recipe["tool"] as? [String] ?? []
recipeDetail.nutrition = recipe["nutrition"] as? [String:String] ?? [:]
return recipeDetail
}
private func stringArrayForKey(_ key: String, dict: Dictionary<String, Any>) -> [String] {
if let value = dict[key] as? [String] {
return value
} else if let orderedList = dict[key] as? [Any] {
var entries: [String] = []
for dict in orderedList {
guard let dict = dict as? [String: Any] else { continue }
guard let text = dict["text"] as? String else { continue }
entries.append(text)
}
return entries
}
return []
}
}

View File

@@ -5,59 +5,5 @@ import Foundation
//let url = "https://www.chefkoch.de/rezepte/1385981243676608/Knusprige-Entenbrust.html"
let url = "https://www.allrecipes.com/recipe/234620/mascarpone-mashed-potatoes/"
var contents: String? = nil
if let url = URL(string: url) {
do {
contents = try String(contentsOf: url)
//print(contents)
} catch {
print("ERROR: Could not load url content.")
}
} else {
print("ERROR: Bad url.")
}
guard let html = contents else {
print("ERROR: no contents")
exit(1)
}
let doc: Document = try SwiftSoup.parse(html)
let elements: Elements = try doc.select("script")
for elem in elements.array() {
for attr in elem.getAttributes()!.asList() {
//print(attr.getValue())
if attr.getValue() == "application/ld+json" {
do {
let jsonString = try elem.html()
//print(json)
let json = try JSONSerialization.jsonObject(with: jsonString.data(using: .utf8)!, options: .fragmentsAllowed)
if let recipe = json as? [String : Any] {
print("1")
getRecipe(fromDict: recipe)
} else if let recipe = (json as! [Any])[0] as? [String : Any] {
print("2")
getRecipe(fromDict: recipe)
}
} catch {
print("COULD NOT DECODE")
}
}
}
}
func getRecipe(fromDict recipe: Dictionary<String, Any>) {
if recipe["@type"] as? String ?? "" == "Recipe" {
print(recipe["name"] ?? "No name")
print(recipe["recipeIngredient"] ?? "No ingredients")
print(recipe["recipeInstruction"] ?? "No instruction")
} else if (recipe["@type"] as? [String] ?? []).contains("Recipe") {
print(recipe["name"] ?? "No name")
}
}
let scraper = RecipeScaper()

View File

@@ -0,0 +1,118 @@
import SwiftSoup
import Foundation
/// Extracts a recipe from the JSON-LD (`application/ld+json`) metadata embedded in a web page.
class RecipeScraper {
    /// Downloads the page at `url` and returns the first recipe described in one of its
    /// JSON-LD script tags.
    ///
    /// - Parameter url: Address of the page to scrape.
    /// - Returns: The scraped recipe, or `nil` when the URL is malformed, the page cannot be
    ///   loaded, or no recipe is embedded in the page.
    /// - Throws: Errors raised by SwiftSoup while parsing or querying the HTML.
    func scrape(url: String) throws -> RecipeDetail? {
        guard let pageURL = URL(string: url) else {
            print("ERROR: Bad url.")
            return nil
        }

        // Report a failed download as "no recipe" instead of terminating the host process.
        // Note: this load is synchronous and blocks the calling thread.
        let html: String
        do {
            html = try String(contentsOf: pageURL)
        } catch {
            print("ERROR: Could not load url content.")
            return nil
        }

        let doc = try SwiftSoup.parse(html)
        let scripts: Elements = try doc.select("script")
        for script in scripts.array() {
            // A script tag without attributes simply cannot be a JSON-LD tag.
            let attributes = script.getAttributes()?.asList() ?? []
            guard attributes.contains(where: { $0.getValue() == "application/ld+json" }) else { continue }
            guard let dict = toDict(script) else { continue }
            return getRecipe(fromDict: dict)
        }
        return nil
    }

    /// Decodes the JSON body of `elem` and returns the first object typed as a recipe.
    ///
    /// The JSON may be a single object, an array of objects, or an object whose `@graph`
    /// entry holds an array of objects; all of these are searched.
    ///
    /// - Parameter elem: A script element expected to contain JSON-LD.
    /// - Returns: The recipe dictionary, or `nil` if the JSON is invalid or holds no recipe.
    private func toDict(_ elem: Element) -> [String: Any]? {
        let json: Any
        do {
            let jsonString = try elem.html()
            json = try JSONSerialization.jsonObject(with: Data(jsonString.utf8), options: .fragmentsAllowed)
        } catch {
            print("Unable to decode json")
            return nil
        }

        // Conditional casts only: `.fragmentsAllowed` means `json` may be a bare string or
        // number, and arrays may be empty.
        var candidates: [[String: Any]] = []
        if let node = json as? [String: Any] {
            candidates.append(node)
            if let graph = node["@graph"] as? [Any] {
                candidates += graph.compactMap { $0 as? [String: Any] }
            }
        } else if let nodes = json as? [Any] {
            candidates = nodes.compactMap { $0 as? [String: Any] }
        }

        guard !candidates.isEmpty else {
            print("Json is not a dict")
            return nil
        }
        guard let recipe = candidates.first(where: { isRecipe($0) }) else {
            print("Json dict is not a recipe ...")
            return nil
        }
        return recipe
    }

    /// Returns `true` when the `@type` entry of `node` is, or contains, `"Recipe"`.
    private func isRecipe(_ node: [String: Any]) -> Bool {
        if let type = node["@type"] as? String {
            return type == "Recipe"
        }
        return (node["@type"] as? [String] ?? []).contains("Recipe")
    }

    /// Copies the known fields of a recipe dictionary into a `RecipeDetail`.
    ///
    /// Entries that are missing or have an unexpected type fall back to an empty value
    /// (`"New Recipe"` for the name, `0` for the yield).
    ///
    /// - Parameter recipe: A decoded JSON-LD recipe object.
    /// - Returns: The populated recipe detail.
    private func getRecipe(fromDict recipe: Dictionary<String, Any>) -> RecipeDetail? {
        var recipeDetail = RecipeDetail()
        recipeDetail.name = recipe["name"] as? String ?? "New Recipe"
        recipeDetail.recipeCategory = recipe["recipeCategory"] as? String ?? ""
        recipeDetail.keywords = recipe["keywords"] as? String ?? ""
        recipeDetail.description = recipe["description"] as? String ?? ""
        recipeDetail.dateCreated = recipe["dateCreated"] as? String ?? ""
        recipeDetail.dateModified = recipe["dateModified"] as? String ?? ""
        // NOTE(review): JSON-LD recipes usually carry the picture under "image", not
        // "imageUrl" - confirm which key is intended here.
        recipeDetail.imageUrl = recipe["imageUrl"] as? String ?? ""
        recipeDetail.url = recipe["url"] as? String ?? ""
        recipeDetail.cookTime = recipe["cookTime"] as? String ?? ""
        recipeDetail.prepTime = recipe["prepTime"] as? String ?? ""
        recipeDetail.totalTime = recipe["totalTime"] as? String ?? ""
        recipeDetail.recipeInstructions = stringArrayForKey("recipeInstructions", dict: recipe)
        recipeDetail.recipeYield = recipe["recipeYield"] as? Int ?? 0
        recipeDetail.recipeIngredient = recipe["recipeIngredient"] as? [String] ?? []
        recipeDetail.tool = recipe["tool"] as? [String] ?? []
        recipeDetail.nutrition = recipe["nutrition"] as? [String: String] ?? [:]
        return recipeDetail
    }

    /// Reads the value stored under `key` as a list of strings.
    ///
    /// Accepts a single string, a list of strings, or a list of objects that each carry a
    /// `text` entry; list items of any other shape are skipped.
    ///
    /// - Parameters:
    ///   - key: The dictionary key to read.
    ///   - dict: The decoded JSON object to read from.
    /// - Returns: The extracted strings in their original order, or an empty array.
    private func stringArrayForKey(_ key: String, dict: Dictionary<String, Any>) -> [String] {
        if let single = dict[key] as? String {
            return [single]
        }
        guard let items = dict[key] as? [Any] else { return [] }
        return items.compactMap { item -> String? in
            if let text = item as? String {
                return text
            }
            return (item as? [String: Any])?["text"] as? String
        }
    }
}
// Playground driver: scrape one sample page and print the outcome.
// Alternative sample page:
//let url = "https://www.chefkoch.de/rezepte/1385981243676608/Knusprige-Entenbrust.html"
let url = "https://www.allrecipes.com/recipe/234620/mascarpone-mashed-potatoes/"
let scraper = RecipeScraper()
do {
    // `scrape` returns nil when the page holds no recipe and throws only when the HTML
    // cannot be processed, so the two outcomes are reported separately.
    if let recipe = try scraper.scrape(url: url) {
        print(recipe)
    } else {
        print("No recipe on this website found.")
    }
} catch {
    print("Scraping failed: \(error)")
}

View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<playground version='5.0' target-platform='ios' buildActiveScheme='true' importAppTypes='true'>
<timeline fileName='timeline.xctimeline'/>
</playground>